data:
  tokenizer: null
  train_files: ~/data/rlhf/gsm8k/train.parquet
  val_files: ~/data/rlhf/gsm8k/test.parquet
  prompt_key: prompt
  max_prompt_length: 512
  max_response_length: 512
  train_batch_size: 1024
  val_batch_size: 1312
  return_raw_input_ids: False  # set to True when the policy and reward model tokenizers differ
  return_raw_chat: False

actor_rollout_ref:
  hybrid_engine: True
  model:
    path: ~/models/deepseek-llm-7b-chat
    external_lib: null
    override_config: {}
    enable_gradient_checkpointing: False
  actor:
    strategy: megatron  # this is for backward-compatibility
    ppo_mini_batch_size: 256
    ppo_micro_batch_size: 64
    clip_ratio: 0.2
    entropy_coeff: 0.001
    ppo_epochs: 1
    shuffle: True
    optim:
      lr: 1e-6
      clip_grad: 1.0
      lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
      min_lr_ratio: null  # only useful for warmup with cosine
      warmup_style: constant  # select from constant/cosine
      total_training_steps: -1  # must be overridden by the program
    megatron:
      tensor_model_parallel_size: 4
      pipeline_model_parallel_size: 1
      num_layers_per_virtual_pipeline_stage: null  # vpp will hang; needs debugging
      sequence_parallel: True
      seed: 1
    load_weight: True
  ref:
    megatron:
      tensor_model_parallel_size: 4
      pipeline_model_parallel_size: 1
      num_layers_per_virtual_pipeline_stage: null  # vpp will hang; needs debugging
      sequence_parallel: True
      seed: 1
    load_weight: True
    param_offload: False
    log_prob_micro_batch_size: 32
  rollout:
    name: vllm
    temperature: 1.0
    top_k: -1  # 0 for hf rollout, -1 for vllm rollout
    top_p: 1
    prompt_length: ${data.max_prompt_length}  # for xperf_gpt
    response_length: ${data.max_response_length}
    # for vllm rollout
    dtype: bfloat16  # should align with FSDP
    gpu_memory_utilization: 0.5
    ignore_eos: False
    enforce_eager: True
    free_cache_engine: True
    load_format: dummy_megatron
    tensor_model_parallel_size: 2
    max_num_batched_tokens: 8192
    max_num_seqs: 1024
    log_prob_micro_batch_size: 2
    # for hf rollout
    do_sample: True
    layer_name_map:
      qkv_layer_name: qkv
      gate_proj_layer_name: gate_up
    # number of responses (i.e. num sample times)
    n: 1

critic:
  strategy: megatron
  optim:
    lr: 1e-5
    clip_grad: 1.0
    lr_warmup_steps_ratio: 0.  # the total steps will be injected during runtime
    min_lr_ratio: null  # only useful for warmup with cosine
    warmup_style: constant  # select from constant/cosine
    total_training_steps: -1  # must be overridden by the program
  model:
    path: ~/models/deepseek-llm-7b-chat
    tokenizer_path: ${actor_rollout_ref.model.path}
    override_config: {}
    external_lib: ${actor_rollout_ref.model.external_lib}
    enable_gradient_checkpointing: False
  megatron:
    tensor_model_parallel_size: 4
    pipeline_model_parallel_size: 1
    num_layers_per_virtual_pipeline_stage: null  # vpp will hang; needs debugging
    sequence_parallel: True
    seed: 1
  load_weight: True
  ppo_mini_batch_size: ${actor_rollout_ref.actor.ppo_mini_batch_size}
  ppo_micro_batch_size: 2
  ppo_epochs: ${actor_rollout_ref.actor.ppo_epochs}
  shuffle: ${actor_rollout_ref.actor.shuffle}
  cliprange_value: 0.5
  kl_ctrl:
    type: fixed
    kl_coef: 0.001

reward_model:
  enable: False
  strategy: megatron
  megatron:
    tensor_model_parallel_size: 4
    pipeline_model_parallel_size: 1
    num_layers_per_virtual_pipeline_stage: null  # vpp will hang; needs debugging
    sequence_parallel: True
    seed: 1
  model:
    input_tokenizer: ${actor_rollout_ref.model.path}  # set this to null if the chat template is identical
    path: ~/models/FsfairX-LLaMA3-RM-v0.1
    external_lib: ${actor_rollout_ref.model.external_lib}
  load_weight: True
  param_offload: False
  micro_batch_size: 64
  max_length: null

algorithm:
  gamma: 1.0
  lam: 1.0
  adv_estimator: gae
  kl_penalty: kl  # how to estimate kl divergence
  kl_ctrl:
    type: fixed
    kl_coef: 0.001

trainer:
  total_epochs: 30
  total_training_steps: null
  project_name: verl_examples
  experiment_name: gsm8k
  logger: ['console', 'wandb']
  nnodes: 1
  n_gpus_per_node: 8
  save_freq: -1
  test_freq: 2
  critic_warmup: 0
  default_hdfs_dir: ~/experiments/gsm8k/ppo/${trainer.experiment_name}
  default_local_dir: checkpoints/${trainer.project_name}/${trainer.experiment_name}