Initial commit
This commit is contained in:
13
verl/third_party/vllm/vllm_v_0_4_2/__init__.py
vendored
Normal file
13
verl/third_party/vllm/vllm_v_0_4_2/__init__.py
vendored
Normal file
@@ -0,0 +1,13 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
320
verl/third_party/vllm/vllm_v_0_4_2/arg_utils.py
vendored
Normal file
320
verl/third_party/vllm/vllm_v_0_4_2/arg_utils.py
vendored
Normal file
@@ -0,0 +1,320 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py
|
||||
|
||||
import os
|
||||
import argparse
|
||||
import dataclasses
|
||||
from dataclasses import dataclass
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import torch.nn as nn
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
from .config import ModelConfig, LoadConfig
|
||||
|
||||
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, EngineConfig, LoRAConfig, ParallelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, TokenizerPoolConfig, VisionLanguageConfig)
|
||||
from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS
|
||||
from vllm.utils import str_to_int_tuple
|
||||
|
||||
|
||||
def nullable_str(val: str):
    """Map an empty value or the literal string ``"None"`` to ``None``.

    Any other value is returned unchanged.
    """
    is_null = (not val) or val == "None"
    return None if is_null else val
|
||||
|
||||
|
||||
@dataclass
class EngineArgs:
    """Arguments for vLLM engine.

    The model is described by a HuggingFace config object
    (``model_hf_config``); this dataclass has no ``model`` or ``tokenizer``
    field.
    """
    # Forward references are quoted so the annotations are not evaluated
    # when the class is created.
    model_hf_config: Optional["PretrainedConfig"] = None
    skip_tokenizer_init: bool = False
    served_model_name: Optional[Union[str, List[str]]] = None  # TODO
    download_dir: Optional[str] = None
    load_format: str = 'auto'
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
    quantization_param_path: Optional[str] = None
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    enable_prefix_caching: bool = False
    use_v2_block_manager: bool = False
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_logprobs: int = 5  # OpenAI default value
    disable_log_stats: bool = False
    revision: Optional[str] = None
    code_revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    enforce_eager: bool = False
    max_context_len_to_capture: Optional[int] = None
    max_seq_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    tokenizer_pool_size: int = 0
    tokenizer_pool_type: str = "ray"
    tokenizer_pool_extra_config: Optional[dict] = None
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    fully_sharded_loras: bool = False
    lora_extra_vocab_size: int = 256
    # NOTE: unannotated, so this is a plain class attribute and NOT a
    # dataclass field: it cannot be passed to the constructor. Left as-is to
    # keep the positional order of the generated __init__ unchanged.
    lora_dtype = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'auto'
    ray_workers_use_nsight: bool = False
    num_gpu_blocks_override: Optional[int] = None
    num_lookahead_slots: int = 0
    model_loader_extra_config: Optional[dict] = None

    # Related to Vision-language models such as llava
    image_input_type: Optional[str] = None
    image_token_id: Optional[int] = None
    image_input_shape: Optional[str] = None
    image_feature_size: Optional[int] = None
    scheduler_delay_factor: float = 0.0
    enable_chunked_prefill: bool = False

    guided_decoding_backend: str = 'outlines'
    # Speculative decoding configuration.
    speculative_model: Optional[str] = None
    num_speculative_tokens: Optional[int] = None
    speculative_max_model_len: Optional[int] = None
    ngram_prompt_lookup_max: Optional[int] = None
    ngram_prompt_lookup_min: Optional[int] = None

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Register the shared vLLM engine CLI arguments on ``parser``.

        Args:
            parser: The parser to extend in place.

        Returns:
            The same ``parser`` object, for chaining.
        """
        # Model arguments
        # TODO(shengguangming): delete the unused args
        parser.add_argument('--model',
                            type=str,
                            default='facebook/opt-125m',
                            help='name or path of the huggingface model to use')
        # `tokenizer` and `tokenizer_mode` are not fields of this dataclass,
        # so their defaults are spelled out here instead of being read from
        # EngineArgs (which raised AttributeError).
        parser.add_argument('--tokenizer',
                            type=str,
                            default=None,
                            help='name or path of the huggingface tokenizer to use')
        parser.add_argument('--revision',
                            type=str,
                            default=None,
                            help='the specific model version to use. It can be a branch '
                            'name, a tag name, or a commit id. If unspecified, will use '
                            'the default version.')
        parser.add_argument('--tokenizer-revision',
                            type=str,
                            default=None,
                            help='the specific tokenizer version to use. It can be a branch '
                            'name, a tag name, or a commit id. If unspecified, will use '
                            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default='auto',
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                            'default to the default cache dir of '
                            'huggingface')
        # NOTE(review): these choices mirror upstream vLLM; a value that is
        # not a member of the LoadFormat enum used by LoadConfig will be
        # rejected when the engine config is created.
        parser.add_argument('--load-format',
                            type=str,
                            default=EngineArgs.load_format,
                            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
                            help='The format of the model weights to load. '
                            '"auto" will try to load the weights in the safetensors format '
                            'and fall back to the pytorch bin format if safetensors format '
                            'is not available. '
                            '"pt" will load the weights in the pytorch bin format. '
                            '"safetensors" will load the weights in the safetensors format. '
                            '"npcache" will load the weights in pytorch format and store '
                            'a numpy cache to speed up the loading. '
                            '"dummy" will initialize the weights with random values, '
                            'which is mainly for profiling.')
        parser.add_argument('--dtype',
                            type=str,
                            default=EngineArgs.dtype,
                            choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
                            help='data type for model weights and activations. '
                            'The "auto" option will use FP16 precision '
                            'for FP32 and FP16 models, and BF16 precision '
                            'for BF16 models.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=None,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument('--gpu-memory-utilization',
                            type=float,
                            default=EngineArgs.gpu_memory_utilization,
                            help='the percentage of GPU memory to be used for '
                            'the model executor')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=['awq', None],
                            default=None,
                            help='Method used to quantize the weights')
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        """Build an ``EngineArgs`` from a parsed argparse namespace.

        Only dataclass fields that are present on ``args`` are copied; every
        other field keeps its dataclass default. (The parser built by
        ``add_cli_args`` defines only a subset of the fields.)
        """
        # Get the list of attributes of this dataclass.
        field_names = [f.name for f in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        provided = {name: getattr(args, name) for name in field_names if hasattr(args, name)}
        return cls(**provided)

    def create_engine_config(
        self,
    ) -> "EngineConfig":
        """Assemble the ``EngineConfig`` from this object's fields.

        Reads the ``WORLD_SIZE`` environment variable and stores it on the
        parallel config.

        Raises:
            AssertionError: If ``WORLD_SIZE`` is not set in the environment.
            ValueError: If ``image_input_type`` is set without
                ``image_token_id``, ``image_input_shape`` and
                ``image_feature_size``.
        """
        device_config = DeviceConfig(self.device)
        # NOTE(sgm): we only modify ModelConfig, other configs are import from vllm
        model_config = ModelConfig(self.model_hf_config, self.dtype, self.seed, self.revision, self.code_revision,
                                   self.tokenizer_revision, self.max_model_len, self.quantization,
                                   self.quantization_param_path, self.enforce_eager, self.max_context_len_to_capture,
                                   self.max_seq_len_to_capture, self.max_logprobs, self.skip_tokenizer_init,
                                   self.served_model_name)
        cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization,
                                   self.swap_space, self.kv_cache_dtype, self.num_gpu_blocks_override,
                                   model_config.get_sliding_window(), self.enable_prefix_caching)
        parallel_config = ParallelConfig(
            self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray,
            self.max_parallel_loading_workers, self.disable_custom_all_reduce,
            TokenizerPoolConfig.create_config(
                self.tokenizer_pool_size,
                self.tokenizer_pool_type,
                self.tokenizer_pool_extra_config,
            ), self.ray_workers_use_nsight)

        # Use the world_size set by TORCHRUN
        world_size = int(os.getenv("WORLD_SIZE", "-1"))
        assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
        parallel_config.world_size = world_size

        # TODO: spec config
        speculative_config = SpeculativeConfig.maybe_create_spec_config(
            target_model_config=model_config,
            target_parallel_config=parallel_config,
            target_dtype=self.dtype,
            speculative_model=self.speculative_model,
            num_speculative_tokens=self.num_speculative_tokens,
            speculative_max_model_len=self.speculative_max_model_len,
            enable_chunked_prefill=self.enable_chunked_prefill,
            use_v2_block_manager=self.use_v2_block_manager,
            ngram_prompt_lookup_max=self.ngram_prompt_lookup_max,
            ngram_prompt_lookup_min=self.ngram_prompt_lookup_min,
        )

        scheduler_config = SchedulerConfig(
            self.max_num_batched_tokens,
            self.max_num_seqs,
            model_config.max_model_len,
            self.use_v2_block_manager,
            # A speculative config, when present, overrides the lookahead slots.
            num_lookahead_slots=(self.num_lookahead_slots
                                 if speculative_config is None else speculative_config.num_lookahead_slots),
            delay_factor=self.scheduler_delay_factor,
            enable_chunked_prefill=self.enable_chunked_prefill,
        )

        # A LoRA config is only built when LoRA is enabled; a non-positive or
        # unset max_cpu_loras is passed on as None.
        lora_config = LoRAConfig(max_lora_rank=self.max_lora_rank,
                                 max_loras=self.max_loras,
                                 fully_sharded_loras=self.fully_sharded_loras,
                                 lora_extra_vocab_size=self.lora_extra_vocab_size,
                                 lora_dtype=self.lora_dtype,
                                 max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else
                                 None) if self.enable_lora else None

        load_config = LoadConfig(
            load_format=self.load_format,
            download_dir=self.download_dir,
            model_loader_extra_config=self.model_loader_extra_config,
        )

        if self.image_input_type:
            if (not self.image_token_id or not self.image_input_shape or not self.image_feature_size):
                raise ValueError('Specify `image_token_id`, `image_input_shape` and '
                                 '`image_feature_size` together with `image_input_type`.')
            vision_language_config = VisionLanguageConfig(
                image_input_type=VisionLanguageConfig.get_image_input_enum_type(self.image_input_type),
                image_token_id=self.image_token_id,
                image_input_shape=str_to_int_tuple(self.image_input_shape),
                image_feature_size=self.image_feature_size,
            )
        else:
            vision_language_config = None

        decoding_config = DecodingConfig(guided_decoding_backend=self.guided_decoding_backend)

        return EngineConfig(model_config=model_config,
                            cache_config=cache_config,
                            parallel_config=parallel_config,
                            scheduler_config=scheduler_config,
                            device_config=device_config,
                            lora_config=lora_config,
                            vision_language_config=vision_language_config,
                            speculative_config=speculative_config,
                            load_config=load_config,
                            decoding_config=decoding_config)
|
||||
200
verl/third_party/vllm/vllm_v_0_4_2/config.py
vendored
Normal file
200
verl/third_party/vllm/vllm_v_0_4_2/config.py
vendored
Normal file
@@ -0,0 +1,200 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py
|
||||
|
||||
import enum
|
||||
import json
|
||||
from typing import List, Optional, Union
|
||||
from dataclasses import dataclass, field, fields
|
||||
|
||||
from transformers import PretrainedConfig
|
||||
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor.layers.quantization import get_quantization_config
|
||||
from vllm.transformers_utils.config import get_hf_text_config
|
||||
from vllm.utils import is_hip
|
||||
# Add for verl
|
||||
from vllm.config import ModelConfig, _get_and_verify_dtype, _get_and_verify_max_len
|
||||
|
||||
GPTQMarlinConfig = get_quantization_config("gptq_marlin")
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
_GB = 1 << 30
|
||||
|
||||
|
||||
class ModelConfig(ModelConfig):
    """Configuration for the model, built from a HuggingFace config object.

    This subclass replaces the constructor of ``vllm.config.ModelConfig``:
    it takes ``hf_config`` directly and does not call the parent
    ``__init__``.

    Args:
        hf_config: HuggingFace config of the model. Its ``_name_or_path`` is
            used as both ``self.model`` and ``self.tokenizer``.
        dtype: Data type for model weights and activations. The "auto" option
            will use FP16 precision for FP32 and FP16 models, and BF16 precision
            for BF16 models.
        seed: Random seed for reproducibility.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id. If unspecified, will use the default
            version.
        code_revision: The specific revision to use for the model code on
            Hugging Face Hub. It can be a branch name, a tag name, or a
            commit id. If unspecified, will use the default version.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id. If unspecified, will use
            the default version.
        max_model_len: Maximum length of a sequence (including prompt and
            output). If None, will be derived from the model.
        quantization: Quantization method that was used to quantize the model
            weights. If None, we assume the model weights are not quantized.
        quantization_param_path: Path to JSON file containing scaling factors.
            Used to load KV cache scaling factors into the model when KV cache
            type is FP8_E4M3 on ROCm (AMD GPU). In the future these will also
            be used to load activation and weight scaling factors when the
            model dtype is FP8_E4M3 on ROCm.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: DEPRECATED. Passing any value other than
            None raises ``ValueError``; use ``max_seq_len_to_capture``.
        max_seq_len_to_capture: Maximum sequence len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode.
        max_logprobs: Stored as ``self.max_logprobs``.
        skip_tokenizer_init: If true, skip initialization of tokenizer and
            detokenizer.
        served_model_name: The model name used in metrics tag `model_name`,
            matches the model name exposed via the APIs. If multiple model
            names provided, the first name will be used. If not specified,
            the model name will be the same as `model`.

    Raises:
        ValueError: If ``max_context_len_to_capture`` is not None.
    """

    def __init__(
        self,
        hf_config: PretrainedConfig,
        dtype: str,
        seed: int,
        revision: Optional[str] = None,
        code_revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
        quantization_param_path: Optional[str] = None,
        enforce_eager: bool = False,
        max_context_len_to_capture: Optional[int] = None,
        max_seq_len_to_capture: Optional[int] = None,
        max_logprobs: int = 5,
        skip_tokenizer_init: bool = False,
        served_model_name: Optional[Union[str, List[str]]] = None,
    ) -> None:
        # Model and tokenizer are both identified by the config's own path.
        self.model = hf_config._name_or_path
        self.tokenizer = hf_config._name_or_path
        self.seed = seed
        self.revision = revision
        self.code_revision = code_revision
        self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization
        self.quantization_param_path = quantization_param_path
        self.enforce_eager = enforce_eager
        self.max_context_len_to_capture = max_context_len_to_capture
        if self.max_context_len_to_capture is not None:
            raise ValueError("`max_context_len_to_capture` is deprecated. "
                             "Use `max_seq_len_to_capture` instead.")
        self.max_seq_len_to_capture = (max_seq_len_to_capture or max_context_len_to_capture)
        self.max_logprobs = max_logprobs
        self.skip_tokenizer_init = skip_tokenizer_init

        # The HF config is supplied by the caller instead of being fetched
        # from `model` as upstream does.
        self.hf_config = hf_config
        self.hf_text_config = get_hf_text_config(hf_config)
        # TODO: for multimodal model
        self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
        self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len)

        # Resolve the served model name as documented above: fall back to
        # `model` when unset, and take the first entry of a list.
        if not served_model_name:
            self.served_model_name = self.model
        elif isinstance(served_model_name, list):
            self.served_model_name = served_model_name[0]
        else:
            self.served_model_name = served_model_name

        # NOTE: `_verify_load_format()` and `_verify_tokenizer_mode()` are
        # not invoked in this adaptation; only the two checks below run.
        self._verify_quantization()
        self._verify_cuda_graph()
|
||||
|
||||
|
||||
class LoadFormat(str, enum.Enum):
    """String-valued enum of the weight load formats accepted by ``LoadConfig``."""

    AUTO = "auto"
    MEGATRON = "megatron"
    HF = "hf"
    DTENSOR = "dtensor"
    DUMMY_HF = "dummy_hf"
    DUMMY_MEGATRON = "dummy_megatron"
    DUMMY_DTENSOR = "dummy_dtensor"
|
||||
|
||||
|
||||
@dataclass
class LoadConfig:
    """Configuration describing how model weights are loaded.

    Attributes:
        load_format: The format of the model weights to load. A string is
            lower-cased and converted to a ``LoadFormat`` member, so it must
            be one of "auto", "megatron", "hf", "dtensor", "dummy_hf",
            "dummy_megatron" or "dummy_dtensor". Non-string values are kept
            as given without validation.
        download_dir: Directory to download and load the weights, default to
            the default cache directory of huggingface.
        model_loader_extra_config: Extra loader options. A JSON string is
            parsed into a dict in ``__post_init__``.

    Raises:
        ValueError: If ``load_format`` is a string that is not a
            ``LoadFormat`` value, or is not supported on ROCm.
    """

    load_format: Union[str, LoadFormat, "BaseModelLoader"] = LoadFormat.AUTO
    download_dir: Optional[str] = None
    model_loader_extra_config: Optional[Union[str, dict]] = field(default_factory=dict)

    def __post_init__(self):
        """Parse a JSON-string extra config and validate ``load_format``."""
        model_loader_extra_config = self.model_loader_extra_config or {}
        if isinstance(model_loader_extra_config, str):
            self.model_loader_extra_config = json.loads(model_loader_extra_config)
        self._verify_load_format()

    def _verify_load_format(self) -> None:
        """Normalise a string ``load_format`` into a ``LoadFormat`` member."""
        if not isinstance(self.load_format, str):
            return

        load_format = self.load_format.lower()
        try:
            self.load_format = LoadFormat(load_format)
        except ValueError:
            # Same exception type as the bare enum lookup, but the message
            # tells the caller which values are accepted.
            supported_load_format = [fmt.value for fmt in LoadFormat]
            raise ValueError(f"Unknown load format '{load_format}'. "
                             f"Supported load formats are "
                             f"{supported_load_format}") from None

        # No format is currently excluded on ROCm, so this check never fires
        # until the list below is populated.
        rocm_not_supported_load_format: List[str] = []
        if is_hip() and load_format in rocm_not_supported_load_format:
            rocm_supported_load_format = [
                f for f in LoadFormat.__members__ if (f not in rocm_not_supported_load_format)
            ]
            raise ValueError(f"load format '{load_format}' is not supported in ROCm. "
                             f"Supported load formats are "
                             f"{rocm_supported_load_format}")
|
||||
269
verl/third_party/vllm/vllm_v_0_4_2/dtensor_weight_loaders.py
vendored
Normal file
269
verl/third_party/vllm/vllm_v_0_4_2/dtensor_weight_loaders.py
vendored
Normal file
@@ -0,0 +1,269 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
|
||||
|
||||
from typing import Dict, Iterable, Tuple
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from torch.distributed._tensor import DTensor, Shard, Replicate
|
||||
|
||||
from vllm.model_executor.layers.linear import *
|
||||
from vllm.model_executor.models import ModelRegistry
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
|
||||
|
||||
def gemma_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> None:
    """Copy Gemma actor weights into the parameters of ``vllm_model``.

    Args:
        actor_weights: Mapping from parameter name to weight. Each weight is
            passed through ``redistribute_dtensor`` before loading
            (presumably DTensors -- confirm against the caller).
        vllm_model: Model whose ``named_parameters()`` receive the weights.

    Returns:
        None. The model is updated in place.
    """
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    ]

    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        for (param_name, shard_name, shard_id) in stacked_params_mapping:
            if shard_name not in name:
                continue
            stacked_name = name.replace(shard_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if stacked_name.endswith(".bias") and stacked_name not in params_dict:
                continue
            # The weight is redistributed under its original (unstacked) name.
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[stacked_name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            # lm_head is not used in vllm as it is tied with embed_token.
            # To prevent errors, skip loading lm_head.weight.
            if "lm_head.weight" in name:
                continue
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            # GemmaRMSNorm is different from Llama's in that it multiplies
            # (1 + weight) to the output, instead of just weight.
            if "norm.weight" in name:
                local_loaded_weight = local_loaded_weight + 1.0
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
|
||||
|
||||
|
||||
def gptbigcode_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
    """Copy GPT-BigCode actor weights into the parameters of ``vllm_model``.

    ``lm_head.weight`` and the ``.attn.bias`` entries are skipped; every
    other entry of ``actor_weights`` is redistributed, cast to the target
    parameter's dtype and handed to that parameter's weight loader.
    """
    vllm_params = dict(vllm_model.named_parameters(remove_duplicate=False))
    for weight_name, dtensor_weight in actor_weights.items():
        # ".attn.bias" is the attention mask.
        # NOTE: "c_attn.bias" does not contain ".attn.bias" and is still loaded.
        skipped = "lm_head.weight" in weight_name or ".attn.bias" in weight_name
        if skipped:
            continue
        local_weight = redistribute_dtensor(param_name=weight_name, loaded_weights=dtensor_weight)
        target = vllm_params[weight_name]
        load_fn = getattr(target, "weight_loader", default_weight_loader)
        load_fn(target, local_weight.to(dtype=target.dtype))
|
||||
|
||||
|
||||
def starcoder2_dtensor_load_weights(actor_weights: Dict, vllm_model: nn.Module):
    """Copy Starcoder2 actor weights into the parameters of ``vllm_model``.

    ``q_proj`` / ``k_proj`` / ``v_proj`` weights are loaded as shards of the
    stacked ``qkv_proj`` parameter; everything else is loaded under its own
    name. ``rotary_emb.inv_freq`` entries are skipped, as is
    ``lm_head.weight`` when the model config ties word embeddings.
    """
    # (stacked parameter name, checkpoint shard name, shard id)
    qkv_mapping = [
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
    ]

    vllm_params = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue

        # First mapping entry whose shard name occurs in the weight name.
        hit = next((entry for entry in qkv_mapping if entry[1] in name), None)
        if hit is not None:
            stacked_param, shard_name, shard_id = hit
            name = name.replace(shard_name, stacked_param)
            local_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            target = vllm_params[name]
            load_fn = target.weight_loader
            load_fn(target, local_weight.to(dtype=target.dtype), shard_id)
            continue

        if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
            continue
        target = vllm_params[name]
        local_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
        load_fn = getattr(target, "weight_loader", default_weight_loader)
        load_fn(target, local_weight.to(dtype=target.dtype))
|
||||
|
||||
|
||||
def llama_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> None:
    """Copy Llama actor weights into the parameters of ``vllm_model``.

    Args:
        actor_weights: Mapping from parameter name to weight. Each weight is
            passed through ``redistribute_dtensor`` before loading
            (presumably DTensors -- confirm against the caller).
        vllm_model: Model whose ``named_parameters()`` receive the weights.
            Its ``config.tie_word_embeddings`` decides whether
            ``lm_head.weight`` is skipped.

    Returns:
        None. The model is updated in place.
    """
    stacked_params_mapping = [
        # (param_name, shard_name, shard_id)
        (".qkv_proj", ".q_proj", "q"),
        (".qkv_proj", ".k_proj", "k"),
        (".qkv_proj", ".v_proj", "v"),
        (".gate_up_proj", ".gate_proj", 0),
        (".gate_up_proj", ".up_proj", 1),
    ]
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        if ("rotary_emb.cos_cached" in name or "rotary_emb.sin_cached" in name):
            # Models trained using ColossalAI may include these tensors in
            # the checkpoint. Skip them.
            continue
        # With tie_word_embeddings, we can skip lm_head.weight
        # The weight might appear unnecessarily in the files if the model is
        # processed with quantization, LoRA, fine-tuning, etc.
        if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
            continue
        for (param_name, weight_name, shard_id) in stacked_params_mapping:
            if weight_name not in name:
                continue
            # `name` is rewritten to the stacked parameter name from here on.
            name = name.replace(weight_name, param_name)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = param.weight_loader
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype), shard_id)
            break
        else:
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in params_dict:
                continue
            local_loaded_weight = redistribute_dtensor(param_name=name, loaded_weights=loaded_weight)
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            # Cast to the parameter dtype, as every other loader path does.
            weight_loader(param, local_loaded_weight.to(dtype=param.dtype))
|
||||
|
||||
|
||||
def qwen2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    """Load Qwen2 actor weights, given as DTensors, into ``vllm_model`` in place.

    Separate q/k/v and gate/up checkpoint tensors are routed into the fused
    ``qkv_proj`` / ``gate_up_proj`` vLLM parameters; every other tensor goes
    to the parameter of the same name. Nothing is returned explicitly.

    Args:
        actor_weights: Mapping from checkpoint parameter name to its DTensor.
        vllm_model: The vLLM model whose parameters are overwritten.
    """
    # (fused vLLM parameter, checkpoint shard, shard id)
    fused_layout = (
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    )
    # remove_duplicate=False keeps every name under which a shared parameter
    # is registered in the lookup table.
    vllm_params = dict(vllm_model.named_parameters(remove_duplicate=False))

    def _is_unknown_bias(param_key):
        # Extra bias tensors (e.g. in GPTQ checkpoints) have no vLLM counterpart.
        return param_key.endswith(".bias") and param_key not in vllm_params

    for name, dtensor in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        if vllm_model.config.tie_word_embeddings and "lm_head.weight" in name:
            continue
        for fused_key, shard_key, shard_id in fused_layout:
            if shard_key not in name:
                continue
            name = name.replace(shard_key, fused_key)
            if _is_unknown_bias(name):
                continue
            gathered = redistribute_dtensor(param_name=name, loaded_weights=dtensor)
            target = vllm_params[name]
            target.weight_loader(target, gathered.to(dtype=target.dtype), shard_id)
            break
        else:
            # Not part of a fused parameter: load it under its own name.
            if _is_unknown_bias(name):
                continue
            target = vllm_params[name]
            gathered = redistribute_dtensor(param_name=name, loaded_weights=dtensor)
            loader = getattr(target, "weight_loader", default_weight_loader)
            loader(target, gathered.to(dtype=target.dtype))
|
||||
|
||||
|
||||
def gpt2_dtensor_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    """Placeholder DTensor loader for GPT-2; it currently loads nothing.

    Both arguments are ignored and ``None`` is returned.
    """
    return None
|
||||
|
||||
|
||||
def redistribute_dtensor(param_name: str, loaded_weights: DTensor, parallelize_plan: Dict = None):
    """Turn a DTensor into a plain local tensor for loading into vLLM.

    Args:
        param_name: Full parameter name; it is normalised with
            ``_process_parameter_names`` before being looked up in the plan.
        loaded_weights: The distributed tensor to materialise.
        parallelize_plan: Optional mapping from normalised parameter name to
            the placements to redistribute to. When omitted, the full tensor
            is gathered instead.

    Returns:
        The local shard under the requested placements, or the full tensor
        when no plan is given.
    """
    plan_key = _process_parameter_names(name=param_name)

    # Without a plan, every caller simply receives the complete tensor.
    if parallelize_plan is None:
        return loaded_weights.full_tensor()

    assert plan_key in parallelize_plan.keys(), \
        f"param name: {plan_key} not in parallelize_plan :{parallelize_plan.keys()}"
    target_placement = parallelize_plan[plan_key]
    resharded = loaded_weights.redistribute(device_mesh=loaded_weights.device_mesh,
                                            placements=target_placement)
    return resharded.to_local()
|
||||
|
||||
|
||||
def _process_parameter_names(name):
|
||||
# Remove '.weight' if it exists at the end of the string
|
||||
if name.endswith(".weight"):
|
||||
name = name[:-7]
|
||||
|
||||
# Remove 'model.layers.x.' or 'model.' prefix
|
||||
if "model.layers" in name:
|
||||
parts = name.split('.')
|
||||
# Reconstruct the string without 'model.layers.x.'
|
||||
name = '.'.join(parts[3:]) # parts[0] is 'model', parts[1] is 'layers', parts[2] is 'x'
|
||||
elif name.startswith("model."):
|
||||
name = name[6:] # Remove 'model.'
|
||||
|
||||
return name
|
||||
|
||||
|
||||
# Maps a vLLM model class name to the function that loads DTensor weights
# into it. Key order is kept because the keys are printed in the error raised
# for unsupported architectures.
__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__ = {
    'GPT2LMHeadModel': gpt2_dtensor_weight_loader,
    # Architectures that reuse the Llama loader
    # (mistral is the same as llama in vLLM).
    **dict.fromkeys(
        (
            'LlamaForCausalLM',
            'LLaMAForCausalLM',
            'MistralForCausalLM',
            'InternLMForCausalLM',
            'AquilaModel',
            'AquilaForCausalLM',
            'Phi3ForCausalLM',
        ),
        llama_dtensor_weight_loader,
    ),
    'GemmaForCausalLM': gemma_dtensor_weight_loader,
    'GPTBigCodeForCausalLM': gptbigcode_dtensor_load_weights,
    'Starcoder2ForCausalLM': starcoder2_dtensor_load_weights,
    'Qwen2ForCausalLM': qwen2_dtensor_weight_loader,
}
|
||||
|
||||
|
||||
# the actor model is .state_dict()
|
||||
# Load dtensor weights
|
||||
def load_dtensor_weights(actor_weights: Dict, vllm_model: nn.Module):
    """Load DTensor actor weights into ``vllm_model`` and move it to the GPU.

    The loader is selected by the class name of ``vllm_model``.

    Args:
        actor_weights: Mapping from parameter name to DTensor.
        vllm_model: The vLLM model to update in place.

    Raises:
        ValueError: If no loader is registered for the model's class name.
    """
    architecture = vllm_model.__class__.__name__
    load_fn = _get_model_weight_loader(architecture)
    load_fn(actor_weights, vllm_model)
    # NOTE(sgm) to reduce peak memory usage, the vllm model is offloaded to
    # cpu after init, so it has to be moved back after the weights are synced
    # in the first iteration.
    vllm_model.cuda()
|
||||
|
||||
|
||||
def _get_model_weight_loader(arch: str):
    """Return the DTensor weight loader registered for ``arch``.

    Args:
        arch: Model class name, e.g. ``'LlamaForCausalLM'``.

    Raises:
        ValueError: If ``arch`` has no entry in the loader registry.
    """
    if arch not in __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__:
        raise ValueError(f"Model architectures {arch} are not supported for now. "
                         f"Supported architectures: {__MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__.keys()}")
    return __MODEL_DTENSOR_WEIGHT_LOADER_REGISTRY__[arch]
|
||||
|
||||
|
||||
# NOTE(sgm): we use per-parameter weight loader in each vllm sub
|
||||
def update_dtensor_weight_loader():
    """No-op hook; it takes no arguments and returns ``None``."""
    return None
|
||||
91
verl/third_party/vllm/vllm_v_0_4_2/hf_weight_loader.py
vendored
Normal file
91
verl/third_party/vllm/vllm_v_0_4_2/hf_weight_loader.py
vendored
Normal file
@@ -0,0 +1,91 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
|
||||
|
||||
from typing import Dict, Union, Optional, Iterable, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
from vllm.model_executor.model_loader.weight_utils import default_weight_loader
|
||||
|
||||
|
||||
def update_hf_weight_loader():
    """Replace ``GemmaForCausalLM.load_weights`` with ``gemma_load_weights``.

    vLLM's Gemma module is imported inside the function, so it is only
    loaded when this hook is called.
    """
    import vllm.model_executor.models.gemma as gemma_module

    setattr(gemma_module.GemmaForCausalLM, "load_weights", gemma_load_weights)
|
||||
|
||||
|
||||
def gemma_load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]):
    """Load HF-format Gemma weights into this model's parameters.

    Intended to be bound as ``GemmaForCausalLM.load_weights``. Separate
    q/k/v and gate/up tensors are routed into the fused ``qkv_proj`` /
    ``gate_up_proj`` parameters; norm weights are shifted by ``+1.0`` on a
    copy so the incoming tensors are never modified in place.

    Args:
        weights: Iterable of ``(name, tensor)`` pairs.

    Raises:
        RuntimeError: If some model parameter received no weight.
    """
    # (fused vLLM parameter, checkpoint shard, shard id)
    fused_layout = (
        ("qkv_proj", "q_proj", "q"),
        ("qkv_proj", "k_proj", "k"),
        ("qkv_proj", "v_proj", "v"),
        ("gate_up_proj", "gate_proj", 0),
        ("gate_up_proj", "up_proj", 1),
    )
    model_params = dict(self.named_parameters())
    seen = set()
    for name, tensor in weights:
        for fused_key, shard_key, shard_id in fused_layout:
            if shard_key not in name:
                continue
            name = name.replace(shard_key, fused_key)
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in model_params:
                continue
            target = model_params[name]
            target.weight_loader(target, tensor, shard_id)
            break
        else:
            # lm_head is not used in vllm as it is tied with embed_token.
            # To prevent errors, skip loading lm_head.weight.
            if "lm_head.weight" in name:
                continue
            # Skip loading extra bias for GPTQ models.
            if name.endswith(".bias") and name not in model_params:
                continue
            # GemmaRMSNorm is different from Llama's in that it multiplies
            # (1 + weight) to the output, instead of just weight. The addition
            # is out of place so the actor weights stay untouched.
            value = tensor + 1.0 if "norm.weight" in name else tensor
            target = model_params[name]
            loader = getattr(target, "weight_loader", default_weight_loader)
            loader(target, value)
        seen.add(name)

    missing = model_params.keys() - seen
    if missing:
        raise RuntimeError("Some weights are not initialized from checkpoints: "
                           f"{missing}")
|
||||
|
||||
|
||||
def load_hf_weights(actor_weights: Dict, vllm_model: nn.Module):
    """Load HF-format actor weights into ``vllm_model`` and move it to the GPU.

    The model's own ``load_weights`` is called with the dict items while the
    default torch dtype is set to the dtype of the model's first parameter.
    Afterwards every submodule gets its post-loading hooks run.

    Args:
        actor_weights: Mapping from parameter name to tensor.
        vllm_model: The vLLM model to update in place.
    """
    assert isinstance(actor_weights, Dict)
    model_dtype = next(vllm_model.parameters()).dtype
    with set_default_torch_dtype(model_dtype):  # TODO
        vllm_model.load_weights(actor_weights.items())
        for _, submodule in vllm_model.named_modules():
            method = getattr(submodule, "quant_method", None)
            if method is not None:
                method.process_weights_after_loading(submodule)
            # FIXME: Remove this after Mixtral is updated
            # to use quant_method.
            if hasattr(submodule, "process_weights_after_loading"):
                submodule.process_weights_after_loading()
    vllm_model.cuda()
|
||||
306
verl/third_party/vllm/vllm_v_0_4_2/llm.py
vendored
Normal file
306
verl/third_party/vllm/vllm_v_0_4_2/llm.py
vendored
Normal file
@@ -0,0 +1,306 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py
|
||||
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
from tqdm import tqdm
|
||||
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
|
||||
from transformers import PretrainedConfig
|
||||
import torch.nn as nn
|
||||
from .arg_utils import EngineArgs
|
||||
from .llm_engine_sp import LLMEngine
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import MultiModalData
|
||||
from vllm.usage.usage_lib import UsageContext
|
||||
from vllm.utils import Counter
|
||||
import torch
|
||||
from torch.nn.utils.rnn import pad_sequence
|
||||
from verl.workers.rollout.tokenizer import HybridEngineBaseTokenizer
|
||||
|
||||
|
||||
class LLM:
    """An LLM for generating texts from given prompts and sampling parameters.

    This class includes a tokenizer, a language model (possibly distributed
    across multiple GPUs), and GPU memory space allocated for intermediate
    states (aka KV cache). Given a batch of prompts and sampling parameters,
    this class generates texts from the model, using an intelligent batching
    mechanism and efficient memory management.

    NOTE: This class is intended to be used for offline inference. For online
    serving, use the `AsyncLLMEngine` class instead.
    NOTE: For the comprehensive list of arguments, see `EngineArgs`.

    Args:
        model: The model itself or its parameter dict.
        tokenizer: A HuggingFace Transformers tokenizer instance, or a
            `HybridEngineBaseTokenizer`.
        model_hf_config: The HuggingFace config of the model; forwarded to
            `EngineArgs`.
        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
            if available, and "slow" will always use the slow tokenizer.
            NOTE(review): accepted but not forwarded to `EngineArgs` here.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
            NOTE(review): accepted but not forwarded to `EngineArgs` here.
        tensor_parallel_size: The number of GPUs to use for distributed
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
            the `torch_dtype` attribute specified in the model config file.
            However, if the `torch_dtype` in the config is `float32`, we will
            use `float16` instead.
        quantization: The method used to quantize the model weights. Currently,
            we support "awq". If None, we assume the model weights are not
            quantized and use `dtype` to determine the data type of the weights.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id.
        seed: The seed to initialize the random number generator for sampling.
        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
            reserve for the model weights, activations, and KV cache. Higher
            values will increase the KV cache size and thus improve the model's
            throughput. However, if the value is too high, it may cause out-of-
            memory (OOM) errors.
        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
            This can be used for temporarily storing the states of the requests
            when their `best_of` sampling parameters are larger than 1. If all
            requests will have `best_of=1`, you can safely set this to 0.
            Otherwise, too small values may cause out-of-memory (OOM) errors.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode.
        disable_custom_all_reduce: See ParallelConfig
        load_format: Weight loading format; forwarded to `EngineArgs`.
    """

    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
        model_hf_config: PretrainedConfig,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = False,
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        seed: int = 0,
        gpu_memory_utilization: float = 0.9,
        swap_space: int = 4,
        enforce_eager: bool = False,
        max_context_len_to_capture: Optional[int] = None,
        disable_custom_all_reduce: bool = False,
        load_format: str = 'auto',
        **kwargs,
    ) -> None:
        """Build the engine arguments, validate the tokenizer and create the engine.

        Raises:
            ValueError: If ``tokenizer`` is not one of the supported tokenizer
                classes.
        """
        # Stat logging is turned off unless the caller asks for it explicitly.
        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True
        engine_args = EngineArgs(
            model_hf_config=model_hf_config,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            max_context_len_to_capture=max_context_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            load_format=load_format,
            **kwargs,
        )
        tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
        if not isinstance(tokenizer, tokenizer_cls):
            # The trailing space after "Must be" keeps the two literals from
            # running together in the rendered message.
            raise ValueError(
                f"Unexpected tokenizer type: {type(tokenizer)}. Must be "
                "one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, verl.workers.rollout.HybridEngineBaseTokenizer"
            )
        self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)
        self.request_counter = Counter()

    def init_cache_engine(self):
        """Delegate to the engine's ``init_cache_engine``."""
        self.llm_engine.init_cache_engine()

    def free_cache_engine(self):
        """Delegate to the engine's ``free_cache_engine``."""
        self.llm_engine.free_cache_engine()

    def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
        """Return the tokenizer currently held by the engine."""
        return self.llm_engine.tokenizer

    def set_tokenizer(
        self,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    ) -> None:
        """Replace the tokenizer held by the engine."""
        self.llm_engine.tokenizer = tokenizer

    def generate(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
        sampling_params: Optional[Union[SamplingParams, List[SamplingParams]]] = None,
        prompt_token_ids: Optional[List[List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> Tuple[torch.Tensor, Union[torch.Tensor, List]]:
        """Generates the completions for the input prompts.

        NOTE: This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: A list of prompts to generate completions for.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
                When it is a single value, it is applied to every prompt.
                When it is a list, the list must have the same length as the
                prompts and it is paired one by one with the prompt.
            prompt_token_ids: A list of token IDs for the prompts. If None, we
                use the tokenizer to convert the prompts to token IDs.
                Entries that are not Python lists are passed through
                `_pre_process_inputs`, which strips the left padding.
            use_tqdm: Whether to use tqdm to display the progress bar.
            lora_request: LoRA request to use for generation, if any.
            multi_modal_data: Multi modal data. Its ``data`` attribute is
                converted to float16 on the passed object.

        Returns:
            The result of `_post_process_outputs`: a tuple of the padded
            output token ids and the padded log-probabilities (an empty list
            when no log-probabilities were produced), ordered like the inputs.

        Raises:
            ValueError: If neither prompts nor token ids are given, if prompts
                are given while the tokenizer is skipped, or if the lengths of
                the inputs do not match.
        """
        if prompts is None and prompt_token_ids is None:
            raise ValueError("Either prompts or prompt_token_ids must be "
                             "provided.")
        if self.llm_engine.model_config.skip_tokenizer_init \
            and prompts is not None:
            raise ValueError("prompts must be None if skip_tokenizer_init "
                             "is True")
        if isinstance(prompts, str):
            # Convert a single prompt to a list.
            prompts = [prompts]
        if (prompts is not None and prompt_token_ids is not None and len(prompts) != len(prompt_token_ids)):
            raise ValueError("The lengths of prompts and prompt_token_ids "
                             "must be the same.")

        if prompts is not None:
            num_requests = len(prompts)
        else:
            assert prompt_token_ids is not None
            num_requests = len(prompt_token_ids)

        if sampling_params is None:
            # Use default sampling params.
            sampling_params = SamplingParams()

        elif isinstance(sampling_params, list) and len(sampling_params) != num_requests:
            raise ValueError("The lengths of prompts and sampling_params "
                             "must be the same.")
        if multi_modal_data:
            multi_modal_data.data = multi_modal_data.data.to(torch.float16)

        # Add requests to the engine.
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
            token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
            # Only tensor-like inputs need the padding stripped. ``None`` means
            # "tokenize the text prompt" and must reach the engine untouched;
            # feeding it to _pre_process_inputs would crash in torch.nonzero.
            if token_ids is not None and not isinstance(token_ids, list):
                # NOTE(shengguangming): convert the rollout input into List[int]
                token_ids = self._pre_process_inputs(token_ids)
            self._add_request(
                prompt,
                sampling_params[i] if isinstance(sampling_params, list) else sampling_params,
                token_ids,
                lora_request=lora_request,
                # Get ith image while maintaining the batch dim.
                multi_modal_data=MultiModalData(type=multi_modal_data.type, data=multi_modal_data.data[i].unsqueeze(0))
                if multi_modal_data else None,
            )
        return self._run_engine(use_tqdm)

    def _add_request(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]],
        lora_request: Optional[LoRARequest] = None,
        multi_modal_data: Optional[MultiModalData] = None,
    ) -> None:
        """Queue one request on the engine under the next sequential request id."""
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(request_id,
                                    prompt,
                                    sampling_params,
                                    prompt_token_ids,
                                    lora_request=lora_request,
                                    multi_modal_data=multi_modal_data)

    def _run_engine(self, use_tqdm: bool) -> Tuple[torch.Tensor, Union[torch.Tensor, List]]:
        """Step the engine until all requests finish and post-process the outputs.

        Args:
            use_tqdm: Whether to show a progress bar over finished requests.

        Returns:
            The result of `_post_process_outputs` applied to the finished
            requests sorted by request id.
        """
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            pbar = tqdm(total=num_requests, desc="Processed prompts", dynamic_ncols=True)
        # Run the engine.
        outputs: List[RequestOutput] = []
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        pbar.update(1)
        if use_tqdm:
            pbar.close()
        # Sort the outputs by request ID.
        # This is necessary because some requests may be finished earlier than
        # its previous requests.
        outputs = sorted(outputs, key=lambda x: int(x.request_id))
        # TODO(shengguangming): maybe we can hack the autoregressive logics without only apply post process for better performance
        return self._post_process_outputs(outputs)

    # NOTE(shengguangming): add for verl
    # TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
    def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
        """Strip the left padding from one prompt and return it as a list.

        The pad id is the tokenizer's ``pad_token_id``, falling back to
        ``eos_token_id`` when no pad token is defined. Everything before the
        first non-pad position is dropped.

        NOTE(review): a prompt consisting only of pad tokens has no non-pad
        position, so the index lookup below raises ``IndexError``.
        """
        tokenizer = self.llm_engine.tokenizer
        pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
        # Index of the first token that is not padding.
        non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
        token_ids = prompt_token_ids[non_pad_index:].tolist()
        return token_ids

    # NOTE(shengguangming): add for verl
    def _post_process_outputs(self, request_outputs: List[RequestOutput]) -> Tuple[torch.Tensor, Union[torch.Tensor, List]]:
        """Collect generated token ids and log-probabilities into padded tensors.

        Args:
            request_outputs: Finished request outputs.

        Returns:
            ``(output_token_ids, logprobs)``. Both are right-padded along the
            sequence dimension with the pad token id (or the eos token id when
            no pad token is defined). ``logprobs`` stays an empty list when no
            completion carried log-probabilities.
        """
        output_token_ids = []
        logprobs = []
        for request_output in request_outputs:  # List[RequestOutput]
            outputs = request_output.outputs
            for output in outputs:  # List[CompletionOutput], usually len == 1
                output_token_ids.append(torch.tensor(output.token_ids))
                # TODO(shengguangming): can be optimized by rewriting the Sampler._get_logprobs() logits
                logprobs_dicts = output.logprobs
                if logprobs_dicts is not None:
                    # Keep only the log-probability of the token that was sampled.
                    logprob = [
                        logprobs_dict[token_id].logprob
                        for logprobs_dict, token_id in zip(logprobs_dicts, output.token_ids)
                    ]
                    logprobs.append(torch.tensor(logprob))

        tokenizer = self.llm_engine.tokenizer
        pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
        output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
        if len(logprobs) > 0:
            # NOTE(review): the log-probabilities are padded with the pad token
            # id as well, so padded positions must be masked by the consumer.
            logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
        return output_token_ids, logprobs

    def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
        """Forward the actor weights and load format to the engine's ``sync_model_weights``."""
        self.llm_engine.sync_model_weights(actor_weights=actor_weights, load_format=load_format)

    def offload_model_weights(self) -> None:
        """Delegate to the engine's ``offload_model_weights``."""
        self.llm_engine.offload_model_weights()
|
||||
283
verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
vendored
Normal file
283
verl/third_party/vllm/vllm_v_0_4_2/llm_engine_sp.py
vendored
Normal file
@@ -0,0 +1,283 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
|
||||
|
||||
import torch
|
||||
from typing import Dict, Optional, Union, Type
|
||||
|
||||
import vllm
|
||||
from vllm.config import (CacheConfig, DecodingConfig, DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig,
|
||||
SpeculativeConfig, VisionLanguageConfig)
|
||||
from vllm.core.scheduler import Scheduler
|
||||
from vllm.engine.output_processor.interfaces import (SequenceGroupOutputProcessor)
|
||||
from vllm.engine.output_processor.stop_checker import StopChecker
|
||||
from vllm.executor.executor_base import ExecutorBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.transformers_utils.detokenizer import Detokenizer
|
||||
from vllm.engine.metrics import StatLogger
|
||||
from vllm.usage.usage_lib import (UsageContext, is_usage_stats_enabled, usage_message)
|
||||
from vllm.utils import Counter
|
||||
from vllm.engine.llm_engine import _load_generation_config_dict
|
||||
from vllm.engine.llm_engine import LLMEngine
|
||||
|
||||
import torch.nn as nn
|
||||
from .arg_utils import EngineArgs
|
||||
from .tokenizer import TokenizerGroup
|
||||
from .config import ModelConfig, LoadConfig
|
||||
|
||||
logger = init_logger(__name__)
|
||||
_LOCAL_LOGGING_INTERVAL_SEC = 5
|
||||
|
||||
|
||||
class LLMEngine(LLMEngine):
|
||||
"""An LLM engine that receives requests and generates texts.
|
||||
|
||||
This is the main class for the vLLM engine. It receives requests
|
||||
from clients and generates texts from the LLM. It includes a tokenizer, a
|
||||
language model (possibly distributed across multiple GPUs), and GPU memory
|
||||
space allocated for intermediate states (aka KV cache). This class utilizes
|
||||
iteration-level scheduling and efficient memory management to maximize the
|
||||
serving throughput.
|
||||
|
||||
The `LLM` class wraps this class for offline batched inference and the
|
||||
`AsyncLLMEngine` class wraps this class for online serving.
|
||||
|
||||
NOTE: The config arguments are derived from the `EngineArgs` class. For the
|
||||
comprehensive list of arguments, see `EngineArgs`.
|
||||
|
||||
Args:
|
||||
model: the actor model initialize outside vllm (add for verl)
|
||||
tokenizer: the initialized tokenizer (add for verl)
|
||||
model_config: The configuration related to the LLM model.
|
||||
cache_config: The configuration related to the KV cache memory
|
||||
management.
|
||||
parallel_config: The configuration related to distributed execution.
|
||||
scheduler_config: The configuration related to the request scheduler.
|
||||
distributed_init_method: The initialization method for distributed
|
||||
execution. See `torch.distributed.init_process_group` for details.
|
||||
placement_group: Ray placement group for distributed execution.
|
||||
Required for distributed execution.
|
||||
log_stats: Whether to log statistics.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
# NOTE(sgm): first two arguments are added for verl
|
||||
model: Union[nn.Module, Dict], # model itself or its parameter dict
|
||||
tokenizer: nn.Module,
|
||||
# NOTE(sgm): vllm original arguments
|
||||
model_config: ModelConfig,
|
||||
cache_config: CacheConfig,
|
||||
parallel_config: ParallelConfig,
|
||||
scheduler_config: SchedulerConfig,
|
||||
device_config: DeviceConfig,
|
||||
load_config: LoadConfig,
|
||||
lora_config: Optional[LoRAConfig],
|
||||
vision_language_config: Optional[VisionLanguageConfig],
|
||||
speculative_config: Optional[SpeculativeConfig],
|
||||
decoding_config: Optional[DecodingConfig],
|
||||
executor_class: Type[ExecutorBase],
|
||||
log_stats: bool,
|
||||
usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
|
||||
) -> None:
|
||||
logger.info(
|
||||
"Initializing an LLM engine (v%s) with config: "
|
||||
"model=%r, speculative_config=%r, tokenizer=%r, "
|
||||
"skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, "
|
||||
"tokenizer_revision=%s, trust_remote_code=%s, dtype=%s, "
|
||||
"max_seq_len=%d, download_dir=%r, load_format=%s, "
|
||||
"tensor_parallel_size=%d, disable_custom_all_reduce=%s, "
|
||||
"quantization=%s, enforce_eager=%s, kv_cache_dtype=%s, "
|
||||
"quantization_param_path=%s, device_config=%s, "
|
||||
"decoding_config=%r, seed=%d, served_model_name=%s)",
|
||||
vllm.__version__,
|
||||
model_config.model,
|
||||
speculative_config,
|
||||
model_config.tokenizer,
|
||||
model_config.skip_tokenizer_init,
|
||||
# model_config.tokenizer_mode,
|
||||
model_config.revision,
|
||||
model_config.tokenizer_revision,
|
||||
# model_config.trust_remote_code,
|
||||
model_config.dtype,
|
||||
model_config.max_model_len,
|
||||
load_config.download_dir,
|
||||
load_config.load_format,
|
||||
parallel_config.tensor_parallel_size,
|
||||
parallel_config.disable_custom_all_reduce,
|
||||
model_config.quantization,
|
||||
model_config.enforce_eager,
|
||||
cache_config.cache_dtype,
|
||||
model_config.quantization_param_path,
|
||||
device_config.device,
|
||||
decoding_config,
|
||||
model_config.seed,
|
||||
# model_config.served_model_name,
|
||||
)
|
||||
# TODO(woosuk): Print more configs in debug mode.
|
||||
|
||||
self.model_config = model_config # TODO: currently is hfconfig
|
||||
self.cache_config = cache_config
|
||||
self.lora_config = lora_config
|
||||
self.vision_language_config = vision_language_config
|
||||
self.parallel_config = parallel_config
|
||||
self.scheduler_config = scheduler_config
|
||||
self.device_config = device_config
|
||||
self.speculative_config = speculative_config
|
||||
self.load_config = load_config
|
||||
self.decoding_config = decoding_config or DecodingConfig()
|
||||
self.log_stats = log_stats
|
||||
|
||||
# self.model = model # should not store the model, it should be deleted
|
||||
# TODO(shengguangming): maybe we can choose init here or from arguments
|
||||
if not self.model_config.skip_tokenizer_init:
|
||||
# TODO: check tokenizer class
|
||||
self._init_tokenizer(tokenizer)
|
||||
self.detokenizer = Detokenizer(self.tokenizer)
|
||||
else:
|
||||
self.detokenizer = None
|
||||
self.tokenizer = None
|
||||
|
||||
self.seq_counter = Counter()
|
||||
# TODO: don't know what's the usage
|
||||
self.generation_config_fields = _load_generation_config_dict(model_config)
|
||||
|
||||
self.model_executor = executor_class(
|
||||
model=model, # add for spmd_gpu_executor
|
||||
model_config=model_config,
|
||||
cache_config=cache_config,
|
||||
parallel_config=parallel_config,
|
||||
scheduler_config=scheduler_config,
|
||||
device_config=device_config,
|
||||
lora_config=lora_config,
|
||||
vision_language_config=vision_language_config,
|
||||
speculative_config=speculative_config,
|
||||
load_config=load_config,
|
||||
)
|
||||
|
||||
# Profile the memory usage and initialize the cache.
|
||||
self._initialize_kv_caches()
|
||||
|
||||
# If usage stat is enabled, collect relevant info.
|
||||
if is_usage_stats_enabled():
|
||||
from vllm.model_executor.model_loader import (get_architecture_class_name)
|
||||
usage_message.report_usage(
|
||||
get_architecture_class_name(model_config),
|
||||
usage_context,
|
||||
extra_kvs={
|
||||
# Common configuration
|
||||
"dtype": str(model_config.dtype),
|
||||
"tensor_parallel_size": parallel_config.tensor_parallel_size,
|
||||
"block_size": cache_config.block_size,
|
||||
"gpu_memory_utilization": cache_config.gpu_memory_utilization,
|
||||
|
||||
# Quantization
|
||||
"quantization": model_config.quantization,
|
||||
"kv_cache_dtype": cache_config.cache_dtype,
|
||||
|
||||
# Feature flags
|
||||
"enable_lora": bool(lora_config),
|
||||
"enable_prefix_caching": cache_config.enable_prefix_caching,
|
||||
"enforce_eager": model_config.enforce_eager,
|
||||
"disable_custom_all_reduce": parallel_config.disable_custom_all_reduce,
|
||||
})
|
||||
|
||||
if self.tokenizer:
|
||||
# Ping the tokenizer to ensure liveness if it runs in a
|
||||
# different process.
|
||||
self.tokenizer.ping()
|
||||
|
||||
# Create the scheduler.
|
||||
# NOTE: the cache_config here have been updated with the numbers of
|
||||
# GPU and CPU blocks, which are profiled in the distributed executor.
|
||||
# NOTE(shengguangming): each process will have independent scheduler
|
||||
self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
|
||||
|
||||
# Metric Logging.
|
||||
if self.log_stats:
|
||||
self.stat_logger = StatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC,
|
||||
labels=dict(model_name=model_config.served_model_name),
|
||||
max_model_len=self.model_config.max_model_len)
|
||||
self.stat_logger.info("cache_config", self.cache_config)
|
||||
|
||||
# Create sequence output processor, e.g. for beam search or
|
||||
# speculative decoding.
|
||||
self.output_processor = (SequenceGroupOutputProcessor.create_output_processor(
|
||||
self.scheduler_config,
|
||||
self.detokenizer,
|
||||
self.scheduler,
|
||||
self.seq_counter,
|
||||
self.get_tokenizer_for_seq,
|
||||
stop_checker=StopChecker(
|
||||
self.scheduler_config.max_model_len,
|
||||
self.get_tokenizer_for_seq,
|
||||
),
|
||||
))
|
||||
|
||||
# TODO(sgm): added for verl, but we may not need a tokenizer in Rollout
|
||||
def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
    """Wrap ``tokenizer`` in a ``TokenizerGroup`` and store it on ``self.tokenizer``.

    Args:
        tokenizer: Object forwarded as the first positional argument of
            ``TokenizerGroup``.
        **tokenizer_init_kwargs: Extra keyword arguments for ``TokenizerGroup``;
            they take precedence over the defaults assembled here.
    """
    group_kwargs = {
        "enable_lora": bool(self.lora_config),
        "max_num_seqs": self.scheduler_config.max_num_seqs,
        "max_input_length": None,
        # Caller-supplied values win over the defaults above.
        **tokenizer_init_kwargs,
    }
    self.tokenizer: TokenizerGroup = TokenizerGroup(tokenizer, **group_kwargs)
|
||||
|
||||
def init_cache_engine(self):
    """Delegate cache-engine initialization to the model executor."""
    # TODO: check whether we should rebuild the CUDAGraph every iter when offload/load KVCache
    # Re-capture CUDAGraph would be time-consuming
    executor = self.model_executor
    executor.init_cache_engine()
|
||||
|
||||
def free_cache_engine(self):
    """Delegate cache-engine teardown to the model executor."""
    executor = self.model_executor
    executor.free_cache_engine()
|
||||
|
||||
# NOTE(sgm): currently, we only support GPU executor
|
||||
# The GPUExecutor remove the Ray dependency
|
||||
@classmethod
def from_engine_args(
    cls,
    model,
    tokenizer,
    engine_args: EngineArgs,
    usage_context: UsageContext = UsageContext.ENGINE_CONTEXT,
) -> "LLMEngine":
    """Create an LLM engine from the engine arguments.

    The engine is always built with ``SPMDGPUExecutor`` as its executor class.

    Args:
        model: Forwarded unchanged as the first constructor argument.
        tokenizer: Forwarded unchanged as the second constructor argument.
        engine_args: Source of the engine configs (``create_engine_config()``)
            and of the ``disable_log_stats`` flag.
        usage_context: Forwarded unchanged to the constructor.

    Returns:
        The newly constructed engine.

    Raises:
        AssertionError: If the configured device type is not ``"cuda"``.
    """
    # Create the engine configs.
    configs = engine_args.create_engine_config()

    # Only the GPU path is supported.
    device_type = configs.device_config.device_type
    assert device_type == "cuda", "Currently, the vllm in verl only support running on GPU"

    # With a world size of 1 the load format is forced to "dummy_hf".
    if configs.parallel_config.world_size == 1:
        configs.load_config.load_format = "dummy_hf"

    from .spmd_gpu_executor import SPMDGPUExecutor

    # Create the LLM engine.
    return cls(
        model,
        tokenizer,
        **configs.to_dict(),
        executor_class=SPMDGPUExecutor,
        log_stats=not engine_args.disable_log_stats,
        usage_context=usage_context,
    )
|
||||
|
||||
def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
    """Forward the actor weights and their load format to the model executor.

    Args:
        actor_weights: Mapping from parameter name to tensor.
        load_format: Format identifier passed through to the executor.
    """
    executor = self.model_executor
    executor.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
|
||||
|
||||
def offload_model_weights(self) -> None:
    """Ask the model executor to offload its model weights."""
    executor = self.model_executor
    executor.offload_model_weights()
|
||||
348
verl/third_party/vllm/vllm_v_0_4_2/megatron_weight_loaders.py
vendored
Normal file
348
verl/third_party/vllm/vllm_v_0_4_2/megatron_weight_loaders.py
vendored
Normal file
@@ -0,0 +1,348 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models
|
||||
|
||||
from typing import Dict
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.model_executor.layers.linear import *
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
|
||||
from vllm.model_executor.layers.activation import ScaledActivation
|
||||
from vllm.model_executor.models import ModelRegistry
|
||||
|
||||
|
||||
# NOTE(shengguangming): replaces the original weight loader function in the class
|
||||
def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    """Parallel Linear weight loader.

    Rebinds ``param.data`` to ``loaded_weight.data`` rather than copying, so
    shape and dtype must already agree.

    Args:
        param: Destination parameter.
        loaded_weight: Source tensor whose data the parameter will point to.

    Raises:
        AssertionError: If the sizes or the dtypes differ.
    """
    param_size, loaded_size = param.size(), loaded_weight.size()
    assert param_size == loaded_size, \
        'the parameter size is not align with the loaded weight size, param size: {}, loaded_weight size: {}'.format(
            param_size, loaded_size)
    assert param.data.dtype == loaded_weight.data.dtype, \
        "if we want to shared weights, the data type should also be the same"

    param.data = loaded_weight.data
|
||||
|
||||
|
||||
def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    """Default weight loader.

    Rebinds ``param.data`` to ``loaded_weight.data`` (no copy) after checking
    that size and dtype match.

    Raises:
        AssertionError: If the sizes or the dtypes differ.
    """
    sizes_match = param.size() == loaded_weight.size()
    assert sizes_match
    assert param.data.dtype == loaded_weight.data.dtype, \
        "if we want to shared weights, the data type should also be the same"

    param.data = loaded_weight.data
|
||||
|
||||
|
||||
def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    """Load GPT-2 actor weights into ``vllm_model``.

    Args:
        actor_weights: Mapping from parameter name to tensor.
        vllm_model: Model whose named parameters receive the weights.

    Note:
        Nothing is returned explicitly despite the annotated return type.
    """
    vllm_params = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        # GPT-2 ties the weights of the embedding layer and the final
        # linear layer, so the LM head is skipped.
        if "lm_head.weight" in name:
            continue
        # Skip attention mask.
        # NOTE: "c_attn.bias" should not be skipped.
        if ".attn.bias" in name or ".attn.masked_bias" in name:
            continue
        if not name.startswith("transformer."):
            name = "transformer." + name
        param = vllm_params[name]
        # The HF's GPT-2 implementation uses Conv1D instead of Linear.
        # Because of this, we need to transpose the weights.
        # Note(zhuohan): the logic below might break quantized models.
        if name.endswith(".weight"):
            for conv1d_weight_name in ("c_attn", "c_proj", "c_fc"):
                if conv1d_weight_name in name:
                    # TODO: check megatron
                    loaded_weight = loaded_weight.t()
        load = getattr(param, "weight_loader", default_weight_loader)
        load(param, loaded_weight)
|
||||
|
||||
|
||||
def llama_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    """Load Megatron LLaMA weights whose names already match the vLLM parameter names.

    Args:
        actor_weights: Mapping from parameter name to tensor.
        vllm_model: Model whose named parameters receive the weights.

    Note:
        Nothing is returned explicitly despite the annotated return type.
    """
    # NOTE(shengguangming): the megatron llama may have this prefix
    vllm_params = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        target = vllm_params[name]
        load = getattr(target, "weight_loader", default_weight_loader)
        load(target, loaded_weight)
|
||||
|
||||
|
||||
def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    """Load Megatron-core LLaMA weights (fused layer-norm naming) into ``vllm_model``.

    Each actor weight name is translated with ``_replace_name`` before lookup.
    Bias entries with no counterpart in the vLLM model and rotary ``inv_freq``
    entries are skipped.

    NOTE: a function with this same name is defined again later in this
    module; the later definition is the one bound after import.

    Args:
        actor_weights: Mapping from Megatron parameter name to tensor.
        vllm_model: Model whose named parameters receive the weights.

    Note:
        Nothing is returned explicitly despite the annotated return type.
    """
    # (megatron core gpt model name, vllm model name); first match wins, so
    # the more specific layer-norm entries precede their generic prefixes.
    # A second, identical "self_attention.linear_qkv" entry was dropped: it
    # could never be the first match.
    params_mapping = [
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
        ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1.layer_norm_weight', 'post_attention_layernorm.weight'),
        ('mlp.linear_fc1.layer_norm_bias', 'post_attention_layernorm.bias'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    vllm_params = dict(vllm_model.named_parameters())
    for megatron_name, loaded_weight in actor_weights.items():
        name = _replace_name(megatron_name, params_mapping)
        bias_without_target = name.endswith('.bias') and name not in vllm_params
        if bias_without_target or "rotary_emb.inv_freq" in name:
            continue
        target = vllm_params[name]
        load = getattr(target, "weight_loader", default_weight_loader)
        load(target, loaded_weight)
|
||||
|
||||
|
||||
def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    """Load Megatron-core LLaMA weights (separate layer-norm modules) into ``vllm_model``.

    Each actor weight name is translated with ``_replace_name`` before lookup.
    Bias entries with no counterpart in the vLLM model and rotary ``inv_freq``
    entries are skipped.

    NOTE: a function with this same name is defined again later in this
    module; the later definition is the one bound after import.

    Args:
        actor_weights: Mapping from Megatron parameter name to tensor.
        vllm_model: Model whose named parameters receive the weights.

    Note:
        Nothing is returned explicitly despite the annotated return type.
    """
    # (megatron core gpt model name, vllm model name); first match wins.
    params_mapping = [
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('input_layernorm', 'input_layernorm'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    vllm_params = dict(vllm_model.named_parameters())
    for megatron_name, loaded_weight in actor_weights.items():
        name = _replace_name(megatron_name, params_mapping)
        bias_without_target = name.endswith('.bias') and name not in vllm_params
        if bias_without_target or "rotary_emb.inv_freq" in name:
            continue
        target = vllm_params[name]
        load = getattr(target, "weight_loader", default_weight_loader)
        load(target, loaded_weight)
|
||||
|
||||
|
||||
def _replace_name(megatron_name, name_mapping):
|
||||
for m_name, v_name in name_mapping:
|
||||
if m_name not in megatron_name:
|
||||
continue
|
||||
if 'layers' in megatron_name: # deal with decoder layers
|
||||
megatron_name = megatron_name.replace('decoder', 'model')
|
||||
megatron_name_list = megatron_name.split('.')
|
||||
if 'layer_norm_weight' in megatron_name_list or 'layer_norm_bias' in megatron_name_list:
|
||||
param_name_list = megatron_name_list[:3]
|
||||
param_name_list.append(v_name)
|
||||
param_name = '.'.join(param_name_list)
|
||||
else:
|
||||
param_name_list = megatron_name_list[:3]
|
||||
weight_or_bias = megatron_name_list[-1]
|
||||
param_name_list.append(v_name)
|
||||
param_name_list.append(weight_or_bias)
|
||||
param_name = '.'.join(param_name_list)
|
||||
return param_name
|
||||
else:
|
||||
param_name = megatron_name.replace(m_name, v_name)
|
||||
return param_name
|
||||
|
||||
|
||||
def llama_megatron_core_te_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    """Load Megatron-core LLaMA weights (fused layer-norm naming) into ``vllm_model``.

    Each actor weight name is translated with ``_replace_name`` before lookup.
    Bias entries with no counterpart in the vLLM model and rotary ``inv_freq``
    entries are skipped.

    NOTE: this re-defines a function of the same name that appears earlier in
    this module; this later definition is the one bound after import.

    Args:
        actor_weights: Mapping from Megatron parameter name to tensor.
        vllm_model: Model whose named parameters receive the weights.

    Note:
        Nothing is returned explicitly despite the annotated return type.
    """
    # (megatron core gpt model name, vllm model name); first match wins, so
    # the more specific layer-norm entries precede their generic prefixes.
    # A second, identical "self_attention.linear_qkv" entry was dropped: it
    # could never be the first match.
    params_mapping = [
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv.layer_norm_weight", "input_layernorm.weight"),
        ("self_attention.linear_qkv.layer_norm_bias", "input_layernorm.bias"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1.layer_norm_weight', 'post_attention_layernorm.weight'),
        ('mlp.linear_fc1.layer_norm_bias', 'post_attention_layernorm.bias'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    vllm_params = dict(vllm_model.named_parameters())
    for megatron_name, loaded_weight in actor_weights.items():
        name = _replace_name(megatron_name, params_mapping)
        bias_without_target = name.endswith('.bias') and name not in vllm_params
        if bias_without_target or "rotary_emb.inv_freq" in name:
            continue
        target = vllm_params[name]
        load = getattr(target, "weight_loader", default_weight_loader)
        load(target, loaded_weight)
|
||||
|
||||
|
||||
def llama_megatron_core_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    """Load Megatron-core LLaMA weights (separate layer-norm modules) into ``vllm_model``.

    Each actor weight name is translated with ``_replace_name`` before lookup.
    Bias entries with no counterpart in the vLLM model and rotary ``inv_freq``
    entries are skipped.

    NOTE: this re-defines a function of the same name that appears earlier in
    this module; this later definition is the one bound after import.

    Args:
        actor_weights: Mapping from Megatron parameter name to tensor.
        vllm_model: Model whose named parameters receive the weights.

    Note:
        Nothing is returned explicitly despite the annotated return type.
    """
    # (megatron core gpt model name, vllm model name); first match wins.
    params_mapping = [
        ("embedding.word_embeddings", "model.embed_tokens"),
        ("self_attention.linear_qkv", "self_attn.qkv_proj"),
        ("self_attention.linear_proj", 'self_attn.o_proj'),
        ('input_layernorm', 'input_layernorm'),
        ('pre_mlp_layernorm', 'post_attention_layernorm'),
        ('mlp.linear_fc1', 'mlp.gate_up_proj'),
        ('mlp.linear_fc2', 'mlp.down_proj'),
        ('decoder.final_layernorm', 'model.norm'),
        ('output_layer', 'lm_head'),
    ]
    # NOTE(shengguangming): the megatron llama may have this prefix
    vllm_params = dict(vllm_model.named_parameters())
    for megatron_name, loaded_weight in actor_weights.items():
        name = _replace_name(megatron_name, params_mapping)
        bias_without_target = name.endswith('.bias') and name not in vllm_params
        if bias_without_target or "rotary_emb.inv_freq" in name:
            continue
        target = vllm_params[name]
        load = getattr(target, "weight_loader", default_weight_loader)
        load(target, loaded_weight)
|
||||
|
||||
|
||||
def _replace_name(megatron_name, name_mapping):
|
||||
for m_name, v_name in name_mapping:
|
||||
if m_name not in megatron_name:
|
||||
continue
|
||||
if 'layers' in megatron_name: # deal with decoder layers
|
||||
megatron_name = megatron_name.replace('decoder', 'model')
|
||||
megatron_name_list = megatron_name.split('.')
|
||||
if 'layer_norm_weight' in megatron_name_list or 'layer_norm_bias' in megatron_name_list:
|
||||
param_name_list = megatron_name_list[:3]
|
||||
param_name_list.append(v_name)
|
||||
param_name = '.'.join(param_name_list)
|
||||
else:
|
||||
param_name_list = megatron_name_list[:3]
|
||||
weight_or_bias = megatron_name_list[-1]
|
||||
param_name_list.append(v_name)
|
||||
param_name_list.append(weight_or_bias)
|
||||
param_name = '.'.join(param_name_list)
|
||||
return param_name
|
||||
else:
|
||||
param_name = megatron_name.replace(m_name, v_name)
|
||||
return param_name
|
||||
|
||||
|
||||
def mistral_megatron_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> nn.Module:
    """Load Megatron Mistral weights whose names already match the vLLM parameter names.

    Args:
        actor_weights: Mapping from parameter name to tensor.
        vllm_model: Model whose named parameters receive the weights.

    Note:
        Nothing is returned explicitly despite the annotated return type.
    """
    # TODO: need to implement a general way to deal with prefix
    vllm_params = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if "rotary_emb.inv_freq" in name:
            continue
        target = vllm_params[name]
        load = getattr(target, "weight_loader", default_weight_loader)
        load(target, loaded_weight)
|
||||
|
||||
|
||||
# Layer class -> weight loader installed on it by update_megatron_weight_loader().
# Every listed layer class uses the same loader.
# TODO(shengguangming): latest commit in vllm fix awq for ScaledActivation and
# add load_weights; ScaledActivation and default_weight_loader are not
# registered here.
__LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__ = dict.fromkeys(
    (
        ColumnParallelLinear,
        MergedColumnParallelLinear,
        QKVParallelLinear,
        RowParallelLinear,
        VocabParallelEmbedding,
        ParallelLMHead,
    ),
    parallel_weight_loader,
)
|
||||
|
||||
# for layer_class, weight_loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
|
||||
# # setattr(layer_class, 'megatron_weight_loader', weight_loader)
|
||||
# layer_class.weight_loader = weight_loader
|
||||
|
||||
# Model class name -> function that loads Megatron actor weights into that model.
# Looked up by _get_model_weight_loader() using the vLLM model's class name.
__MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__ = {
    "GPT2LMHeadModel": gpt2_weight_loader,
    # use te backend for open-source megatron
    "LlamaForCausalLM": llama_megatron_core_te_weight_loader,
    "LLaMAForCausalLM": llama_megatron_core_te_weight_loader,
    "MistralForCausalLM": mistral_megatron_weight_loader,
}
|
||||
|
||||
|
||||
# the actor model is .state_dict()
|
||||
# Load megatron weights
|
||||
def load_megatron_weights(actor_weights: Dict, vllm_model: nn.Module):
    """Load Megatron actor weights into ``vllm_model`` and move the model to CUDA.

    The loader is chosen by the class name of ``vllm_model``.

    Args:
        actor_weights: Mapping from parameter name to tensor.
        vllm_model: Model that receives the weights.

    Raises:
        ValueError: If no loader is registered for the model's class name.
    """
    arch = vllm_model.__class__.__name__
    load = _get_model_weight_loader(arch)
    load(actor_weights, vllm_model)
    # NOTE(sgm) to reduce peak memory usage, we offload vllm model to cpu
    # after init, and we need this after sync model weights for in first iter.
    vllm_model.cuda()
|
||||
|
||||
|
||||
def _get_model_weight_loader(arch: str):
    """Return the Megatron weight loader registered for ``arch``.

    Args:
        arch: Model class name, used as the key into
            ``__MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__``.

    Returns:
        The registered loader function.

    Raises:
        ValueError: If ``arch`` has no registered loader. The message lists
            the architectures this registry supports (it previously listed
            vLLM's own model registry, which is not what is consulted here).
    """
    if arch in __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__:
        return __MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__[arch]
    supported_archs = list(__MODEL_MEGATRON_WEIGHT_LOADER_REGISTRY__)
    raise ValueError(f"Model architectures {arch} are not supported for now. "
                     f"Supported architectures: {supported_archs}")
|
||||
|
||||
|
||||
def update_megatron_weight_loader():
    """Install the Megatron weight loaders on the registered vLLM layer classes.

    Sets ``weight_loader`` on every class in
    ``__LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__`` and replaces
    ``VocabParallelEmbedding.__init__`` with ``vocab_init``. Both changes are
    made on the classes themselves.
    """
    for layer_cls, loader in __LAYER_WEIGHT_MEGATRON_LOADER_REGISTRY__.items():
        setattr(layer_cls, 'weight_loader', loader)
    setattr(VocabParallelEmbedding, '__init__', vocab_init)
|
||||
|
||||
|
||||
# FIXME(shengguangming): the vLLM vocab will pad to 64, which may incur out of bounds
|
||||
# so we need to rewrite the init function of vocab
|
||||
DEFAULT_VOCAB_PADDING_SIZE = 64


def vocab_init(self,
               num_embeddings: int,
               embedding_dim: int,
               params_dtype: Optional[torch.dtype] = None,
               org_num_embeddings: Optional[int] = None,
               padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
    """Replacement ``__init__`` for ``VocabParallelEmbedding`` that does not pad the vocabulary.

    The vocabulary range owned by this tensor-parallel rank comes from
    Megatron's ``VocabUtility`` and the weight is allocated for that range only.

    NOTE(review): ``Optional``, ``Parameter``, ``set_weight_attrs`` and the
    ``get_tensor_model_parallel_*`` helpers are not imported by name in this
    module; presumably they arrive through the star import of
    ``vllm.model_executor.layers.linear`` -- confirm.

    Args:
        num_embeddings: Vocabulary size across all partitions.
        embedding_dim: Size of each embedding vector.
        params_dtype: Weight dtype; torch's default dtype when ``None``.
        org_num_embeddings: Stored as ``org_vocab_size``; falls back to
            ``num_embeddings`` when falsy.
        padding_size: Accepted for signature compatibility; unused here
            because padding is disabled.
    """
    super(VocabParallelEmbedding, self).__init__()

    # Keep the input dimensions.
    # TODO (pad to be divided by 4)
    self.num_embeddings = num_embeddings
    self.org_vocab_size = org_num_embeddings or num_embeddings

    # Padding is intentionally left out:
    # self.num_embeddings_padded = pad_vocab_size(num_embeddings,
    #                                             padding_size)
    self.embedding_dim = embedding_dim
    weight_dtype = torch.get_default_dtype() if params_dtype is None else params_dtype
    self.tp_size = get_tensor_model_parallel_world_size()

    # Divide the weight matrix along the vocabulary dimension.
    # TODO: remove dependencies from megatron
    from megatron.core.tensor_parallel.utils import VocabUtility
    vocab_range = VocabUtility.vocab_range_from_global_vocab_size(self.num_embeddings,
                                                                  get_tensor_model_parallel_rank(), self.tp_size)
    self.vocab_start_index, self.vocab_end_index = vocab_range
    self.num_embeddings_per_partition = self.vocab_end_index - self.vocab_start_index

    local_weight = torch.empty(
        self.num_embeddings_per_partition,
        self.embedding_dim,
        # device=torch.cuda.current_device(),
        dtype=weight_dtype)
    self.weight = Parameter(local_weight)
    set_weight_attrs(self.weight, {"parallel_dim": 0, "weight_loader": self.weight_loader})
|
||||
265
verl/third_party/vllm/vllm_v_0_4_2/model_loader.py
vendored
Normal file
265
verl/third_party/vllm/vllm_v_0_4_2/model_loader.py
vendored
Normal file
@@ -0,0 +1,265 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
|
||||
"""Utilities for selecting and loading models."""
|
||||
from typing import Dict, Union, Optional, Iterable, Tuple
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import PreTrainedModel
|
||||
|
||||
from vllm.config import (DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig)
|
||||
from vllm.model_executor.model_loader import BaseModelLoader
|
||||
from vllm.model_executor.model_loader.loader import _initialize_model
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
from vllm.distributed.communication_op import tensor_model_parallel_all_gather
|
||||
|
||||
from .config import ModelConfig, LoadFormat, LoadConfig
|
||||
from .megatron_weight_loaders import load_megatron_weights, update_megatron_weight_loader
|
||||
from .dtensor_weight_loaders import load_dtensor_weights, update_dtensor_weight_loader
|
||||
from .hf_weight_loader import update_hf_weight_loader
|
||||
|
||||
|
||||
def get_model(actor_model: Union[PreTrainedModel, Dict], model_config: ModelConfig, load_config: LoadConfig,
              device_config: DeviceConfig, parallel_config: ParallelConfig, scheduler_config: SchedulerConfig,
              lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig]) -> nn.Module:
    """Build the vLLM model with the loader selected by ``load_config``.

    When the load format starts with ``'dummy'`` the loader is called without
    the actor model; otherwise ``actor_model`` is passed along as well.

    NOTE(review): ``get_model_loader`` also accepts a class as ``load_format``,
    but ``.startswith`` below would fail on a class -- confirm whether that
    combination is meant to be supported here.

    Returns:
        Whatever the selected loader's ``load_model`` returns.
    """
    loader = get_model_loader(load_config)
    load_kwargs = dict(model_config=model_config,
                       device_config=device_config,
                       lora_config=lora_config,
                       vision_language_config=vision_language_config,
                       parallel_config=parallel_config,
                       scheduler_config=scheduler_config)
    if not load_config.load_format.startswith('dummy'):
        # Only non-dummy loaders take the actor model.
        load_kwargs['actor_model'] = actor_model
    return loader.load_model(**load_kwargs)
|
||||
|
||||
|
||||
def get_model_loader(load_config: LoadConfig) -> BaseModelLoader:
    """Get a model loader based on the load format.

    If ``load_config.load_format`` is a class, it is instantiated with
    ``load_config`` and returned. Otherwise the format is compared against the
    supported formats in order; on a match the corresponding weight-loader
    patch function is run (this changes the weight_loader function at runtime)
    and the matching loader is returned.

    Raises:
        ValueError: If the format matches none of the supported formats. The
            message lists every supported format (it previously named only
            MEGATRON and HF).
    """
    load_format = load_config.load_format
    if isinstance(load_format, type):
        return load_format(load_config)

    # (load format, weight-loader patch to apply first, loader class)
    # NOTE(sgm): change the weight_loader function in runtime
    dispatch = (
        (LoadFormat.AUTO, update_megatron_weight_loader, MegatronLoader),
        (LoadFormat.MEGATRON, update_megatron_weight_loader, MegatronLoader),
        (LoadFormat.HF, update_hf_weight_loader, HFLoader),
        (LoadFormat.DTENSOR, update_dtensor_weight_loader, DTensorLoader),
        (LoadFormat.DUMMY_HF, update_hf_weight_loader, DummyModelLoader),
        (LoadFormat.DUMMY_MEGATRON, update_megatron_weight_loader, DummyModelLoader),
        (LoadFormat.DUMMY_DTENSOR, update_dtensor_weight_loader, DummyModelLoader),
    )
    # Compared with ``==`` (not looked up by hash) so plain-string formats
    # behave exactly as in the original if-chain.
    for supported_format, patch_weight_loader, loader_cls in dispatch:
        if load_format == supported_format:
            patch_weight_loader()
            return loader_cls(load_config)

    supported = ', '.join('{}'.format(fmt) for fmt, _, _ in dispatch)
    raise ValueError('load format not supported in verl: {}, supported formats: {}'.format(load_format, supported))
|
||||
|
||||
|
||||
class DummyModelLoader(BaseModelLoader):
    """Model loader that only initializes the model; no weights are loaded here."""

    def __init__(self, load_config: LoadConfig):
        """Store ``load_config`` and reject any extra loader configuration.

        Raises:
            ValueError: If ``load_config.model_loader_extra_config`` is truthy.
        """
        super().__init__(load_config)
        extra_config = load_config.model_loader_extra_config
        if extra_config:
            raise ValueError(f"Model loader extra config is not supported for load format {load_config.load_format}")

    def load_model(self, *, model_config: ModelConfig, device_config: DeviceConfig, lora_config: Optional[LoRAConfig],
                   vision_language_config: Optional[VisionLanguageConfig], parallel_config: ParallelConfig,
                   scheduler_config: SchedulerConfig) -> nn.Module:
        """Initialize the model on the configured device and return it in eval mode.

        The weights are left as initialized; the upstream dummy-weight
        initialization is disabled.
        """
        with set_default_torch_dtype(model_config.dtype), torch.device(device_config.device):
            model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)
        # NOTE(woosuk): For accurate performance evaluation, we assign
        # random values to the weights.
        # initialize_dummy_weights(model)
        return model.eval()
|
||||
|
||||
|
||||
class MegatronLoader(BaseModelLoader):
    """Model loader that can load the model weights from partitioned megatron model."""

    def __init__(self, load_config: LoadConfig):
        """Store ``load_config`` and reject any extra loader configuration.

        Raises:
            ValueError: If ``load_config.model_loader_extra_config`` is truthy.
        """
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

    @staticmethod
    def _get_weights_iterator(actor_model: Union[PreTrainedModel, Dict]):
        """Unimplemented placeholder; always returns ``None``.

        Declared as a static method because the signature has no ``self``:
        as a plain method, calling it on an instance would bind the instance
        to ``actor_model`` and raise ``TypeError`` when an argument is passed.
        """
        # NOTE(shengguangming) Load the weights from the actor model
        pass
        # if isinstance(actor_model, nn.Module):
        #     load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
        # else:
        #     load_weights(actor_weights=actor_model, vllm_model=model)
        # return actor_model

    def load_model(self, actor_model: Union[PreTrainedModel,
                                            Dict], model_config: ModelConfig, device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig],
                   parallel_config: ParallelConfig, scheduler_config: SchedulerConfig) -> nn.Module:
        """Initialize the vLLM model, load Megatron actor weights into it and move it to CUDA.

        Args:
            actor_model: Either an ``nn.Module`` (its named parameters are
                used) or a name-to-tensor mapping.

        Returns:
            The loaded model in eval mode.
        """
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)

            # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
            if isinstance(actor_model, nn.Module):
                load_megatron_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
                                      vllm_model=model)
            else:
                load_megatron_weights(actor_weights=actor_model, vllm_model=model)

            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
                if quant_method is not None:
                    quant_method.process_weights_after_loading(module)
                # FIXME: Remove this after Mixtral is updated
                # to use quant_method.
                if hasattr(module, "process_weights_after_loading"):
                    module.process_weights_after_loading()
        # NOTE(sgm) Some weights are point to gpu, but still need this.
        model = model.cuda()  # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
        return model.eval()
|
||||
|
||||
|
||||
class HFLoader(BaseModelLoader):
    """Model loader that can load the model weights from model's full params."""

    def __init__(self, load_config: LoadConfig):
        """Store ``load_config`` and reject any extra loader configuration.

        Raises:
            ValueError: If ``load_config.model_loader_extra_config`` is truthy.
        """
        super().__init__(load_config)
        extra_config = load_config.model_loader_extra_config
        if extra_config:
            raise ValueError(f"Model loader extra config is not supported for load format {load_config.load_format}")

    def _get_weights_iterator(self, actor_model: Union[PreTrainedModel, Dict]):
        """Return ``(name, tensor)`` items for a mapping or an ``nn.Module``.

        Raises:
            ValueError: If ``actor_model`` is neither a mapping nor an ``nn.Module``.
        """
        if isinstance(actor_model, Dict):
            return actor_model.items()
        if isinstance(actor_model, nn.Module):
            return dict(actor_model.named_parameters()).items()
        raise ValueError(f'actor model should be Dict or nn.Module, but get {type(actor_model)}')

    def load_model(self, actor_model: Union[PreTrainedModel,
                                            Dict], model_config: ModelConfig, device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig],
                   parallel_config: ParallelConfig, scheduler_config: SchedulerConfig) -> nn.Module:
        """Initialize the vLLM model, load the actor's weights through ``model.load_weights`` and move it to CUDA.

        Returns:
            The loaded model in eval mode.
        """
        with set_default_torch_dtype(model_config.dtype):
            # with torch.device(device_config.device):
            # NOTE(sgm): init the model in cpu
            model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)
            weights = self._get_weights_iterator(actor_model)
            model.load_weights(weights)
            for module in model.modules():
                quant_method = getattr(module, "quant_method", None)
                if quant_method is not None:
                    quant_method.process_weights_after_loading(module)
                # FIXME: Remove this after Mixtral is updated
                # to use quant_method.
                if hasattr(module, "process_weights_after_loading"):
                    module.process_weights_after_loading()
        # NOTE(sgm) Some weights are point to gpu, but still need this.
        # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
        model = model.cuda()
        return model.eval()
|
||||
|
||||
|
||||
class DTensorLoader(BaseModelLoader):
    """Model loader that loads the model weights through ``load_dtensor_weights``."""

    def __init__(self, load_config: LoadConfig):
        """Store ``load_config`` and reject any extra loader configuration.

        Raises:
            ValueError: If ``load_config.model_loader_extra_config`` is truthy.
        """
        super().__init__(load_config)
        if load_config.model_loader_extra_config:
            raise ValueError(f"Model loader extra config is not supported for "
                             f"load format {load_config.load_format}")

    @staticmethod
    def _get_weights_iterator(actor_model: Union[PreTrainedModel, Dict]):
        """Unimplemented placeholder; always returns ``None``.

        Declared as a static method because the signature has no ``self``:
        as a plain method, calling it on an instance would bind the instance
        to ``actor_model`` and raise ``TypeError`` when an argument is passed.
        """
        # NOTE(shengguangming) Load the weights from the actor model
        pass
        # if isinstance(actor_model, nn.Module):
        #     load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
        # else:
        #     load_weights(actor_weights=actor_model, vllm_model=model)
        # return actor_model

    def load_model(self, actor_model: Union[PreTrainedModel,
                                            Dict], model_config: ModelConfig, device_config: DeviceConfig,
                   lora_config: Optional[LoRAConfig], vision_language_config: Optional[VisionLanguageConfig],
                   parallel_config: ParallelConfig, scheduler_config: SchedulerConfig) -> nn.Module:
        """Initialize the vLLM model, load the actor weights via ``load_dtensor_weights`` and move it to CUDA.

        Args:
            actor_model: Either an ``nn.Module`` (its named parameters are
                used) or a name-to-tensor mapping.

        Returns:
            The loaded model in eval mode.
        """
        with set_default_torch_dtype(model_config.dtype):
            with torch.device(device_config.device):
                model = _initialize_model(model_config, self.load_config, lora_config, vision_language_config)

            # TODO(sgm): This is a hack, we need to register the load_weight() func for each model in vllm
            if isinstance(actor_model, nn.Module):
                load_dtensor_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)),
                                     vllm_model=model)
            else:
                load_dtensor_weights(actor_weights=actor_model, vllm_model=model)

            for _, module in model.named_modules():
                quant_method = getattr(module, "quant_method", None)
                if quant_method is not None:
                    quant_method.process_weights_after_loading(module)
                # FIXME: Remove this after Mixtral is updated
                # to use quant_method.
                if hasattr(module, "process_weights_after_loading"):
                    module.process_weights_after_loading()
        # NOTE(sgm) Some weights are point to gpu, but still need this.
        model = model.cuda()  # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
        return model.eval()
|
||||
|
||||
|
||||
# FIXME(sgm): replacement for LogitsProcessor._get_logits in vllm v0.4.2.
# Upstream runs under ray, where the logits only have to reach the driver node,
# so a gather is enough there. verl runs SPMD without a central scheduler, so
# every rank needs the full logits and an all_gather is required
# (aligned with v0.2.6).
def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
                embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
    """Compute next-token logits and all-gather them across the tensor-parallel group.

    Args:
        hidden_states: activations multiplied with the transposed embedding.
        embedding: embedding weight; its transpose is the projection matrix.
        embedding_bias: optional bias added in place to the local logits.

    Returns:
        The gathered logits with columns beyond ``self.org_vocab_size`` dropped.
    """
    # Get the logits for the next tokens.
    local_logits = torch.matmul(hidden_states, embedding.t())
    if embedding_bias is not None:
        local_logits += embedding_bias

    gathered = tensor_model_parallel_all_gather(local_logits)
    if gathered is None:
        return gathered
    # Remove paddings in vocab (if any).
    return gathered[:, :self.org_vocab_size]


from vllm.model_executor.layers.logits_processor import LogitsProcessor

# Install the all_gather variant on vLLM's LogitsProcessor.
LogitsProcessor._get_logits = _get_logits
|
||||
281
verl/third_party/vllm/vllm_v_0_4_2/model_runner.py
vendored
Normal file
281
verl/third_party/vllm/vllm_v_0_4_2/model_runner.py
vendored
Normal file
@@ -0,0 +1,281 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/model_runner.py
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from enum import IntEnum
|
||||
from typing import Dict, List, Optional, Set, Tuple, Union
|
||||
|
||||
from vllm.attention import (AttentionMetadata, get_attn_backend)
|
||||
from vllm.config import (DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.layers import LoRAMapping
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
|
||||
from vllm.model_executor import SamplingMetadata
|
||||
from vllm.sequence import (MultiModalData, SamplerOutput, SequenceData, SequenceGroupMetadata)
|
||||
from vllm.utils import (CudaMemoryProfiler, is_hip, is_pin_memory_available)
|
||||
from vllm.worker.model_runner import ModelRunner, CUDAGraphRunner
|
||||
|
||||
from .model_loader import get_model
|
||||
from .config import ModelConfig, LoadConfig
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class BatchType(IntEnum):
    """How batches are constructed."""

    PREFILL = 0  # Every batch is prefill.
    DECODE = 1  # Every batch is decode.
    MIXED = 2  # Batch is a mixture of prefill and decode.
|
||||
|
||||
|
||||
class ModelRunner(ModelRunner):
    """verl adaptation of vLLM's ``ModelRunner``.

    The runner is constructed with an existing actor model (or its parameter
    dict), which ``load_model`` passes to ``get_model``. Every worker prepares
    its inputs the same way and samples its own output; the upstream
    driver-only sampling check is disabled.

    NOTE: this class reuses the name of the imported vLLM base class, so after
    this definition the module-level name ``ModelRunner`` refers to this subclass.
    """

    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        load_config: LoadConfig,
        lora_config: Optional[LoRAConfig],
        kv_cache_dtype: Optional[str] = "auto",
        vision_language_config: Optional[VisionLanguageConfig] = None,
    ):
        """Store the configs and the actor model; the base ``__init__`` is not invoked here."""
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.lora_config = lora_config
        self.load_config = load_config

        # model_config can be None in tests/samplers/test_sampler.py.
        # FIXME(woosuk): This is a hack to make the tests work. Refactor this.
        self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None)
        self.device_config = (device_config if device_config is not None else DeviceConfig())
        self.device = self.device_config.device

        # NOTE(sgm): add for verl
        self.model = model  # this will be replaced by get_model()

        # Set after load_model.
        self.lora_manager: Optional[LRUCacheWorkerLoRAManager] = None

        self.graph_runners: Dict[int, CUDAGraphRunner] = {}
        self.graph_memory_pool: Optional[Tuple[int, int]] = None  # Set during graph capture.

        self.max_seq_len_to_capture = (self.model_config.max_seq_len_to_capture if self.model_config is not None else 0)

        self.pin_memory = is_pin_memory_available()
        self.kv_cache_dtype = kv_cache_dtype
        self.vision_language_config = vision_language_config

        self.attn_backend = get_attn_backend(self.model_config.dtype if model_config is not None else None)

        # Lazy initialization
        self.block_size: int  # Set after initial profiling.
        # When using CUDA graph, the input block tables must be padded to
        # max_seq_len_to_capture. However, creating the block table in
        # Python can be expensive. To optimize this, we cache the block table
        # in numpy and only copy the actual input content at every iteration.
        # The shape of the cached block table will be
        # (max batch size to capture, max context len to capture / block size).
        self.graph_block_tables: torch.Tensor  # Set after initial profiling.

        # Set if the backend is flashinfer.
        self.flashinfer_workspace_buffer: torch.Tensor

    # NOTE(sgm): initialize model using the actor model
    def load_model(self) -> None:
        """Replace ``self.model`` with the vLLM model built from the actor model.

        Also records the memory consumed while loading, wraps the model with a
        LoRA manager when LoRA is configured, and loads or warns about FP8
        KV-cache scaling factors.

        Raises:
            RuntimeError: if FP8 KV-cache scaling factors are provided but the
                model has no callable ``load_kv_cache_scales``.
        """
        with CudaMemoryProfiler() as m:
            self.model = get_model(actor_model=self.model,
                                   model_config=self.model_config,
                                   device_config=self.device_config,
                                   lora_config=self.lora_config,
                                   load_config=self.load_config,
                                   parallel_config=self.parallel_config,
                                   scheduler_config=self.scheduler_config,
                                   vision_language_config=self.vision_language_config)
        self.model_memory_usage = m.consumed_memory
        logger.info("Loading model weights took %.4f GB", self.model_memory_usage / float(2**30))

        if self.lora_config:
            assert hasattr(self.model, "supported_lora_modules") and self.model.supported_lora_modules, (
                "Model does not support LoRA")
            assert hasattr(self.model, "embedding_modules"), "Model does not have embedding_modules"
            assert hasattr(self.model, "embedding_padding_modules"), "Model does not have embedding_padding_modules"
            self.lora_manager = LRUCacheWorkerLoRAManager(self.scheduler_config.max_num_seqs,
                                                          self.scheduler_config.max_num_batched_tokens,
                                                          self.vocab_size, self.lora_config, self.device,
                                                          self.model.embedding_modules,
                                                          self.model.embedding_padding_modules)
            self.model = self.lora_manager.create_lora_manager(self.model)

        if self.kv_cache_dtype == "fp8" and is_hip():
            # Currently scaled KV cache is only enabled on ROCm
            if self.model_config.quantization_param_path is not None:
                if callable(getattr(self.model, "load_kv_cache_scales", None)):
                    self.model.load_kv_cache_scales(self.model_config.quantization_param_path)
                else:
                    # Exceptions do not %-format their arguments the way logging
                    # calls do, so the message is built explicitly.
                    raise RuntimeError("Using FP8 KV cache and scaling factors provided but "
                                       f"model {self.model.__class__} does not support loading scaling factors.")
            else:
                logger.warning("Using FP8 KV cache but no scaling factors "
                               "provided. Defaulting to scaling factors of 1.0. "
                               "This may lead to less accurate results!")
        elif self.model_config.quantization_param_path is not None:
            logger.warning("KV cache scaling factors provided, "
                           "but the KV cache data type is not FP8. "
                           "KV cache scaling factors will not be used.")

    def prepare_input_tensors(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
    ) -> Tuple[torch.Tensor, torch.Tensor, AttentionMetadata, SamplingMetadata, Set[LoRARequest],
               Optional[LoRAMapping], torch.Tensor]:
        """Build the model inputs for one step from the scheduled sequence groups.

        Prompt and decode requests are prepared separately and then coalesced
        into single token / position / slot-mapping tensors on ``self.device``.

        Returns:
            ``(input_tokens, input_positions, attn_metadata, sampling_metadata,
            lora_requests, lora_mapping, multi_modal_input)``; ``lora_mapping``
            is ``None`` when LoRA is not configured.
        """
        # NOTE(sgm): all workers prepare the input in the same way
        prefill_reqs = []
        decode_reqs = []
        for seq_group_meta in seq_group_metadata_list:
            if seq_group_meta.is_prompt:
                prefill_reqs.append(seq_group_meta)
            else:
                decode_reqs.append(seq_group_meta)

        # Prepare input tensors.
        (
            input_tokens,
            input_positions,
            prefill_attn_metadata,
            seq_lens,
            query_lens,
            lora_index_mapping,
            lora_prompt_mapping,
            lora_requests,
            multi_modal_input,
            slot_mapping,
        ) = self._prepare_prompt(prefill_reqs)
        (
            decode_input_tokens,
            decode_input_positions,
            decode_attn_metadata,
            decode_lora_index_mapping,
            decode_lora_prompt_mapping,
            decode_lora_requests,
            decode_slot_mapping,
        ) = self._prepare_decode(decode_reqs)
        sampling_metadata = SamplingMetadata.prepare(seq_group_metadata_list, seq_lens, query_lens, self.device,
                                                     self.pin_memory)

        if not self.scheduler_config.chunked_prefill_enabled:
            # Without chunked prefill a batch is either all prefill or all decode.
            assert not (prefill_reqs and decode_reqs)

        # Counts are taken before the decode entries are appended below.
        num_prefills = len(seq_lens)
        num_prefill_tokens = len(input_tokens)
        num_decode_tokens = len(decode_input_tokens)

        # Coalesce tensors. Note that attn_metadata is currently not
        # coalesced for simplicity.
        input_tokens.extend(decode_input_tokens)
        input_positions.extend(decode_input_positions)
        slot_mapping.extend(decode_slot_mapping)
        lora_index_mapping.extend(decode_lora_index_mapping)
        lora_prompt_mapping.extend(decode_lora_prompt_mapping)
        lora_requests.update(decode_lora_requests)

        input_tokens = torch.tensor(input_tokens, dtype=torch.long, device=self.device)
        input_positions = torch.tensor(input_positions, dtype=torch.long, device=self.device)
        slot_mapping = torch.tensor(slot_mapping, dtype=torch.long, device=self.device)

        if self.lora_config:
            lora_mapping = LoRAMapping(
                lora_index_mapping,
                lora_prompt_mapping,
            )
        else:
            lora_mapping = None

        # No metadata broadcast happens here: each worker builds its own
        # AttentionMetadata from the same scheduled sequence groups.
        attn_metadata = AttentionMetadata(
            num_prefills=num_prefills,
            slot_mapping=slot_mapping,
            num_prefill_tokens=num_prefill_tokens,
            num_decode_tokens=num_decode_tokens,
            prefill_metadata=prefill_attn_metadata,
            decode_metadata=decode_attn_metadata,
            kv_cache_dtype=self.kv_cache_dtype,
        )

        return (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping,
                multi_modal_input)

    @torch.inference_mode()
    def execute_model(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        kv_caches: List[torch.Tensor],
    ) -> Optional[SamplerOutput]:
        """Run one forward pass and sample the next tokens.

        Args:
            seq_group_metadata_list: sequence groups scheduled for this step.
            kv_caches: KV-cache tensors forwarded to the model.

        Returns:
            The sampler output produced by ``self.model.sample``.
        """
        (input_tokens, input_positions, attn_metadata, sampling_metadata, lora_requests, lora_mapping,
         multi_modal_input) = self.prepare_input_tensors(seq_group_metadata_list)

        if self.lora_config:
            self.set_active_loras(lora_requests, lora_mapping)

        # Currently cuda graph is only supported by the decode phase.
        prefill_meta = attn_metadata.prefill_metadata
        decode_meta = attn_metadata.decode_metadata
        if prefill_meta is None and decode_meta.use_cuda_graph:
            graph_batch_size = input_tokens.shape[0]
            model_executable = self.graph_runners[graph_batch_size]
        else:
            model_executable = self.model
        execute_model_kwargs = {
            "input_ids": input_tokens,
            "positions": input_positions,
            "kv_caches": kv_caches,
            "attn_metadata": attn_metadata,
        }
        if self.vision_language_config:
            execute_model_kwargs.update({"image_input": multi_modal_input})
        hidden_states = model_executable(**execute_model_kwargs)

        # Compute the logits.
        logits = self.model.compute_logits(hidden_states, sampling_metadata)

        # The upstream "only sample in the driver worker" early return is
        # disabled here, so every worker samples.
        # TODO(sgm): perform sampling on rank 0
        # Sample the next token.
        output = self.model.sample(
            logits=logits,
            sampling_metadata=sampling_metadata,
        )

        return output
|
||||
294
verl/third_party/vllm/vllm_v_0_4_2/parallel_state.py
vendored
Normal file
294
verl/third_party/vllm/vllm_v_0_4_2/parallel_state.py
vendored
Normal file
@@ -0,0 +1,294 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Adapted from
|
||||
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
|
||||
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
"""Model and data parallel groups."""
|
||||
import os
|
||||
import torch
|
||||
import torch.distributed
|
||||
from typing import Optional
|
||||
|
||||
import vllm.distributed.parallel_state as ps
|
||||
|
||||
import vllm.envs as envs
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from torch.distributed.device_mesh import init_device_mesh
|
||||
|
||||
logger = init_logger(__name__)
|
||||
"""
|
||||
This version is strongly tied with Megatron to implement HybridEngine and weight sharing between vllm and Megatron.
|
||||
- We assume the Megatron tp+dp+pp world is already established before calling this function.
|
||||
|
||||
"""
|
||||
|
||||
# Device mesh for using DTensor
|
||||
_DEVICE_MESH = None
|
||||
|
||||
# Tensor model parallel group that the current rank belongs to.
|
||||
_TP_DEVICE_GROUP = None
|
||||
_TP_CPU_GROUP = None
|
||||
|
||||
|
||||
# This method is for initializing the ParallelGroup when using HybridEngine
|
||||
def initialize_parallel_state(
    distributed_init_method: str = "env://",
    backend: str = "nccl",
    tensor_model_parallel_size: int = 1,
    num_tp_per_train_tp: int = 1,
    pipeline_model_parallel_size: int = 1,
):
    """Initialize the distributed environment and the parallel groups (HybridEngine path).

    ``RANK``, ``LOCAL_RANK`` and ``WORLD_SIZE`` are read from the environment
    (set by TORCHRUN). With more than one process the vLLM-specific inference
    TP groups are built; otherwise the generic ``initialize_model_parallel``
    is used.
    """
    # torch.distributed.all_reduce keeps its input tensor alive until the next
    # synchronization point, so memory grows with the number of all_reduce
    # calls. This env var disables that behavior. Related issue:
    # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
    os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

    # NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN.
    env = os.environ
    rank = int(env.get("RANK", "-1"))
    local_rank = int(env.get("LOCAL_RANK", "0"))
    # Use the world_size set by TORCHRUN
    world_size = int(env.get("WORLD_SIZE", "-1"))
    assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"

    ps.init_distributed_environment(world_size, rank, distributed_init_method, local_rank, backend)

    if torch.distributed.get_world_size() <= 1:
        initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
        return

    # NOTE: build a separate inference group with infer tp & micro dp
    initialize_model_parallel_for_vllm(tensor_model_parallel_size=tensor_model_parallel_size,
                                       num_tensor_model_parallel_groups_per_train_tp=num_tp_per_train_tp)
|
||||
|
||||
|
||||
def ensure_model_parallel_initialized(
    tensor_model_parallel_size: int,
    pipeline_model_parallel_size: int = 1,
    backend: Optional[str] = None,
) -> None:
    """Initialize the model parallel groups on first use, otherwise validate them.

    When the groups already exist, only the tensor-parallel world size is
    compared against the expected value.
    """
    # Fall back to the backend of the default process group.
    if not backend:
        backend = torch.distributed.get_backend()

    if model_parallel_is_initialized():
        assert (get_tensor_model_parallel_world_size() == tensor_model_parallel_size), (
            "tensor parallel group already initialized, but of unexpected size: "
            f"{get_tensor_model_parallel_world_size()=} vs. "
            f"{tensor_model_parallel_size=}")
    else:
        initialize_model_parallel(tensor_model_parallel_size, pipeline_model_parallel_size, backend)
|
||||
# assert (get_pipeline_model_parallel_world_size(
|
||||
# ) == pipeline_model_parallel_size), (
|
||||
# "pipeline parallel group already initialized, but of unexpected size: "
|
||||
# f"{get_pipeline_model_parallel_world_size()=} vs. "
|
||||
# f"{pipeline_model_parallel_size=}")
|
||||
|
||||
|
||||
def model_parallel_is_initialized():
    """Return True once vLLM's tensor-parallel device group has been set.

    Only ``ps._TP_DEVICE_GROUP`` is inspected; the pipeline-parallel group is
    not part of the check.
    """
    tp_group = ps._TP_DEVICE_GROUP
    return tp_group is not None
|
||||
|
||||
|
||||
def initialize_model_parallel_for_vllm(tensor_model_parallel_size: int,
                                       num_tensor_model_parallel_groups_per_train_tp: int = 1) -> None:
    """Create the inference tensor-parallel groups (device group + gloo CPU group).

    Args:
        tensor_model_parallel_size: number of ranks in each inference TP group.
        num_tensor_model_parallel_groups_per_train_tp: how many inference TP
            groups are carved out of one training TP group. With 1, groups are
            consecutive rank ranges. Otherwise each training TP block of
            ``groups_per_train_tp * tensor_model_parallel_size`` ranks is split
            into strided groups, e.g. training tp=4 and inference tp=2 yields
            the rank groups [0, 2] and [1, 3] for the first block.

    The group this rank belongs to is stored both in this module's globals and
    on ``vllm.distributed.parallel_state``.
    """
    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    assert isinstance(tensor_model_parallel_size, int)

    # Build the tensor model-parallel groups.
    assert ps._TP_DEVICE_GROUP is None, ("tensor model parallel group is already initialized")

    global _TP_DEVICE_GROUP
    global _TP_CPU_GROUP

    world_size: int = torch.distributed.get_world_size()
    rank = torch.distributed.get_rank()
    backend = torch.distributed.get_backend()

    num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
    groups_per_train_tp = num_tensor_model_parallel_groups_per_train_tp

    if groups_per_train_tp == 1:
        # Same TP layout as training: consecutive rank ranges.
        rank_groups = [
            list(range(i * tensor_model_parallel_size, (i + 1) * tensor_model_parallel_size))
            for i in range(num_tensor_model_parallel_groups)
        ]
        device_backend = backend
    else:
        assert _TP_DEVICE_GROUP is None, ("tensor model parallel group is already initialized")
        train_tp = groups_per_train_tp * tensor_model_parallel_size
        rank_groups = []
        for train_group in range(num_tensor_model_parallel_groups // groups_per_train_tp):
            start = train_tp * train_group
            end = start + train_tp
            for offset in range(groups_per_train_tp):
                rank_groups.append([r + offset for r in range(start, end, groups_per_train_tp)])
        # This path has always created its device groups with the default
        # backend (no explicit backend argument); keep that.
        device_backend = None

    # Every rank must create every group, in the same order, even the groups
    # it is not a member of.
    for ranks in rank_groups:
        group = torch.distributed.new_group(ranks, backend=device_backend)
        cpu_group = torch.distributed.new_group(ranks, backend="gloo")
        if rank in ranks:
            _TP_DEVICE_GROUP = group
            _TP_CPU_GROUP = cpu_group
            ps._TP_DEVICE_GROUP = group
            ps._TP_CPU_GROUP = cpu_group
|
||||
|
||||
# Build the pipeline model-parallel groups.
|
||||
# global _PIPELINE_MODEL_PARALLEL_GROUP
|
||||
# global _PIPELINE_GLOBAL_RANKS
|
||||
# assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
|
||||
|
||||
# ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group()
|
||||
# ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks()
|
||||
|
||||
|
||||
def initialize_model_parallel(
    tensor_model_parallel_size: int = 1,
    pipeline_model_parallel_size: int = 1,
    backend: Optional[str] = None,
) -> None:
    """
    NOTE: This method is a hack from the open-sourced version without
    assertion of world_size = tp * pp

    Initialize model parallel groups.

    Arguments:
        tensor_model_parallel_size: number of GPUs used for tensor model
            parallelism.
        pipeline_model_parallel_size: number of GPUs used for pipeline model
            parallelism.
        backend: backend for the pipeline groups; defaults to the backend of
            the default process group.

    Let's say we have a total of 8 GPUs denoted by g0 ... g7 and we
    use 2 GPUs to parallelize the model tensor, and 4 GPUs to parallelize
    the model pipeline. The present function will
    create 4 tensor model-parallel groups and 2 pipeline model-parallel groups:
        4 tensor model-parallel groups:
            [g0, g1], [g2, g3], [g4, g5], [g6, g7]
        2 pipeline model-parallel groups:
            [g0, g2, g4, g6], [g1, g3, g5, g7]
    Note that for efficiency, the caller should make sure adjacent ranks
    are on the same DGX box. For example if we are using 2 DGX-1 boxes
    with a total of 16 GPUs, rank 0 to 7 belong to the first box and
    ranks 8 to 15 belong to the second box.

    The tensor-parallel group is taken from a DTensor device mesh
    (``tp_shard`` dimension); no CPU (gloo) TP group is created here.
    """
    global _TP_DEVICE_GROUP
    global _DEVICE_MESH

    # Get world size and rank. Ensure some consistencies.
    assert torch.distributed.is_initialized()
    world_size: int = torch.distributed.get_world_size()
    # get the backend of _DEVICE_WORLD_GROUP
    backend = backend or torch.distributed.get_backend()

    # Check all preconditions before any process group is created, so a
    # repeated call fails without first creating new groups.
    assert _TP_DEVICE_GROUP is None, ("tensor model parallel group is already initialized")
    assert _DEVICE_MESH is None, ("device mesh in vllm is already initialized")
    assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")

    # NOTE(sgm) we don't assert world_size == tp * pp
    # DP is not managed by vllm but by the veRL WorkerGroup
    num_tensor_model_parallel_groups: int = (world_size // tensor_model_parallel_size)
    num_pipeline_model_parallel_groups: int = (world_size // pipeline_model_parallel_size)
    rank = torch.distributed.get_rank()

    # Build device mesh for TP
    if num_tensor_model_parallel_groups > 1:
        device_mesh = init_device_mesh("cuda", (num_tensor_model_parallel_groups, tensor_model_parallel_size),
                                       mesh_dim_names=("replicate", "tp_shard"))
    else:
        device_mesh = init_device_mesh("cuda", (tensor_model_parallel_size,), mesh_dim_names=("tp_shard",))
    shard_group = device_mesh.get_group(mesh_dim="tp_shard")

    # Build the tensor model-parallel groups.
    _DEVICE_MESH = device_mesh
    _TP_DEVICE_GROUP = shard_group
    ps._TP_DEVICE_GROUP = _TP_DEVICE_GROUP
    # TODO: ps._TP_CPU_GROUP is left unset; creating a gloo group for the
    # shard ranks hangs when used together with the device mesh.

    # TODO: init using device mesh
    # Build the pipeline model-parallel groups.
    for i in range(num_pipeline_model_parallel_groups):
        ranks = range(i, world_size, num_pipeline_model_parallel_groups)
        group = torch.distributed.new_group(ranks, backend=backend)
        if rank in ranks:
            ps._PIPELINE_MODEL_PARALLEL_GROUP = group
            ps._PIPELINE_GLOBAL_RANKS = ranks
|
||||
|
||||
|
||||
"""
|
||||
Device mesh utilities
|
||||
"""
|
||||
|
||||
|
||||
def get_device_mesh():
    """Return the module-level device mesh; asserts that it has been initialized."""
    mesh = _DEVICE_MESH
    assert mesh is not None, ("device mesh is not initialized")
    return mesh
|
||||
|
||||
|
||||
"""
|
||||
Tensor model parallel utilities
|
||||
"""
|
||||
|
||||
|
||||
def get_tensor_model_parallel_group():
    """Return the tensor model parallel device group of the calling rank."""
    tp_group = _TP_DEVICE_GROUP
    assert tp_group is not None, ("tensor model parallel group is not initialized")
    return tp_group
|
||||
|
||||
|
||||
def get_tensor_model_parallel_world_size():
    """Return the number of ranks in the tensor model parallel group."""
    tp_group = get_tensor_model_parallel_group()
    return torch.distributed.get_world_size(group=tp_group)
|
||||
|
||||
|
||||
def get_tensor_model_parallel_rank():
    """Return the calling rank's index within the tensor model parallel group."""
    tp_group = get_tensor_model_parallel_group()
    return torch.distributed.get_rank(group=tp_group)
|
||||
|
||||
|
||||
def get_tensor_model_parallel_src_rank():
    """Return the global rank obtained by rounding this rank down to a multiple of the TP size.

    This is the first rank of the tensor model parallel group when groups are
    consecutive rank ranges.
    """
    my_rank = torch.distributed.get_rank()
    tp_size = get_tensor_model_parallel_world_size()
    # Same value as (my_rank // tp_size) * tp_size.
    return my_rank - my_rank % tp_size
|
||||
218
verl/third_party/vllm/vllm_v_0_4_2/spmd_gpu_executor.py
vendored
Normal file
218
verl/third_party/vllm/vllm_v_0_4_2/spmd_gpu_executor.py
vendored
Normal file
@@ -0,0 +1,218 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/executor/gpu_executor.py
|
||||
import os
|
||||
import socket
|
||||
from typing import Any, Dict, List, Optional, Set, Tuple
|
||||
|
||||
import torch
|
||||
import vllm.envs as envs
|
||||
from vllm.executor.executor_base import ExecutorBase, ExecutorAsyncBase
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.sequence import SamplerOutput, ExecuteModelRequest
|
||||
|
||||
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, SpeculativeConfig,
|
||||
VisionLanguageConfig)
|
||||
from .config import ModelConfig, LoadConfig
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class SPMDGPUExecutor(ExecutorBase):
|
||||
"""SPMD-based multi-GPU executor implementations."""
|
||||
|
||||
def __init__(
    self,
    model,  # pytorch model itself or its parameter dict
    model_config: ModelConfig,
    cache_config: CacheConfig,
    parallel_config: ParallelConfig,
    scheduler_config: SchedulerConfig,
    device_config: DeviceConfig,
    load_config: LoadConfig,
    lora_config: Optional[LoRAConfig],
    vision_language_config: Optional[VisionLanguageConfig],
    speculative_config: Optional[SpeculativeConfig],
) -> None:
    """Keep all configs on the executor, then set up the cluster and the local worker."""
    # Model and loading.
    self.model_config = model_config
    self.load_config = load_config
    self.lora_config = lora_config
    self.vision_language_config = vision_language_config
    self.speculative_config = speculative_config

    # Runtime resources.
    self.cache_config = cache_config
    self.parallel_config = parallel_config
    self.scheduler_config = scheduler_config
    self.device_config = device_config

    self._init_executor(model, initialize_cluster(parallel_config))
|
||||
|
||||
# TODO(sgm): verl not support speculative decode now
|
||||
def _init_executor(self, model, distributed_init_method) -> None:
|
||||
assert (not self.speculative_config), "Speculative decoding not yet supported for multi-GPU backend."
|
||||
|
||||
# Create the parallel worker for each GPU.
|
||||
self._init_workers_sp(model, distributed_init_method)
|
||||
|
||||
def _init_workers_sp(self, model, distributed_init_method: str):
    """Create this process's ``Worker``, initialize its device and load the model.

    Args:
        model: pytorch model itself or its parameter dict, handed to the worker.
        distributed_init_method: init method string handed to the worker.

    Raises:
        RuntimeError: if ``RANK`` or ``LOCAL_RANK`` is missing from the environment.
    """
    # Lazy import the Worker to avoid importing torch.cuda/xformers
    # before CUDA_VISIBLE_DEVICES is set in the Worker
    from .worker import Worker  # pylint: disable=import-outside-toplevel

    # Fail with an explicit message instead of the opaque TypeError that
    # int(None) would raise when the launcher did not export these variables.
    missing = [name for name in ("RANK", "LOCAL_RANK") if os.getenv(name) is None]
    if missing:
        raise RuntimeError(f"Environment variable(s) {', '.join(missing)} must be set (e.g. by torchrun) "
                           "before creating the SPMD worker.")
    rank = int(os.environ["RANK"])
    local_rank = int(os.environ["LOCAL_RANK"])
    logger.info("local rank %d", local_rank)

    self.worker = Worker(
        model,
        self.model_config,
        self.parallel_config,
        self.scheduler_config,
        self.device_config,
        self.cache_config,
        self.load_config,
        local_rank,
        rank,
        distributed_init_method,
        lora_config=self.lora_config,
        vision_language_config=self.vision_language_config,
    )

    # NOTE(shengguangming): torch.distributed.init_process_group will be called inside the init_model()
    self.worker.init_device()
    self.worker.load_model()
|
||||
|
||||
def determine_num_available_blocks(self) -> Tuple[int, int]:
|
||||
"""Determine the number of available KV blocks.
|
||||
|
||||
This invokes `determine_num_available_blocks` on each worker and takes
|
||||
the min of the results, guaranteeing that the selected cache sizes are
|
||||
compatible with all workers.
|
||||
|
||||
Returns:
|
||||
- tuple[num_gpu_blocks, num_cpu_blocks]
|
||||
"""
|
||||
# Get the maximum number of blocks that can be allocated on GPU and CPU.
|
||||
num_blocks = self.worker.determine_num_available_blocks()
|
||||
|
||||
# NOTE(shengguangming): Now we don't use a shared centralized controler but each process will
|
||||
# have its own scheduler
|
||||
num_gpu_blocks = num_blocks[0]
|
||||
num_cpu_blocks = num_blocks[1]
|
||||
|
||||
return num_gpu_blocks, num_cpu_blocks
|
||||
|
||||
def initialize_cache(self, num_gpu_blocks: int, num_cpu_blocks: int) -> None:
|
||||
"""Initialize the KV cache in all workers.
|
||||
"""
|
||||
|
||||
# NOTE: We log here to avoid multiple logs when number of workers is
|
||||
# greater than one. We could log in the engine, but not all executors
|
||||
# have GPUs.
|
||||
logger.info("# GPU blocks: %d, # CPU blocks: %d", num_gpu_blocks, num_cpu_blocks)
|
||||
|
||||
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
||||
self.cache_config.num_cpu_blocks = num_cpu_blocks
|
||||
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(
|
||||
f'before init cache memory allocated: {torch.cuda.memory_allocated() / 1e9}GB, reserved: {torch.cuda.memory_reserved() / 1e9}GB'
|
||||
)
|
||||
self.worker.initialize_cache(num_gpu_blocks=num_gpu_blocks, num_cpu_blocks=num_cpu_blocks)
|
||||
if torch.distributed.get_rank() == 0:
|
||||
print(
|
||||
f'after init cache memory allocated: {torch.cuda.memory_allocated() / 1e9}GB, reserved: {torch.cuda.memory_reserved() / 1e9}GB'
|
||||
)
|
||||
|
||||
# NOTE(sgm): This will not profile & capture the model(CUDAGraph) when rebuilding KVCache
|
||||
def init_cache_engine(self) -> None:
|
||||
self.worker._init_cache_engine()
|
||||
|
||||
def free_cache_engine(self) -> None:
|
||||
self.worker.free_cache_engine()
|
||||
|
||||
def execute_model(self, execute_model_req) -> List[SamplerOutput]:
|
||||
all_outputs = self.worker.execute_model(execute_model_req=execute_model_req)
|
||||
|
||||
# NOTE(sgm):
|
||||
# Each GPU in vllm under verl has its own spmd_gpu_executor, therefore all GPUs should return the outputs
|
||||
# In vllm with ray, only the driver worker returns the sampling results.
|
||||
return all_outputs
|
||||
|
||||
def add_lora(self, lora_request: LoRARequest) -> bool:
|
||||
assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
|
||||
return self.worker.add_lora(lora_request=lora_request)
|
||||
|
||||
def remove_lora(self, lora_id: int) -> bool:
|
||||
assert lora_id > 0, "lora_id must be greater than 0."
|
||||
return self.worker.remove_lora(lora_id=lora_id)
|
||||
|
||||
def list_loras(self) -> Set[int]:
|
||||
return self.worker.list_loras()
|
||||
|
||||
def check_health(self) -> None:
|
||||
# SPMDExecutor will always be healthy as long as
|
||||
# it's running.
|
||||
return
|
||||
|
||||
# NOTE(sgm): add for verl
|
||||
def offload_model_weights(self) -> None:
|
||||
self.worker.offload_model_weights()
|
||||
|
||||
def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor], load_format: str) -> None:
|
||||
self.worker.sync_model_weights(actor_weights=actor_weights, load_format=load_format)
|
||||
|
||||
|
||||
def initialize_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
    ray_address: Optional[str] = None,
) -> str:
    """Return the init method used to set up the distributed backend.

    No cluster is actually created here: the function always selects the
    ``env://`` init method.

    Args:
        parallel_config: The configurations for parallel execution (unused).
        engine_use_ray: Unused; kept so existing callers keep working.
        ray_address: Unused; kept so existing callers keep working.

    Returns:
        The `distributed_init_method`, i.e. the address for initializing the
        distributed backend. Always ``'env://'``.
    """
    # We need to setup the distributed init method to make sure
    # the distributed megatron code (e.g., get world size) works correctly.
    # A "tcp://localhost:<port>" address is deliberately not used, so no port
    # needs to be probed here.
    distributed_init_method = 'env://'
    return distributed_init_method
|
||||
|
||||
|
||||
def get_open_port():
    """Return a TCP port number that the OS reported as free.

    A temporary IPv4 stream socket is bound to port 0 so the OS assigns a
    port. The socket is closed before returning, so the port is only known to
    have been free at the time of the call.
    """
    probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        probe.bind(("", 0))
        bound_address = probe.getsockname()
        return bound_address[1]
    finally:
        probe.close()
|
||||
|
||||
|
||||
# TODO(sgm): the async executor is not implemented yet
|
||||
class SPMDGPUExecutorAsync(SPMDGPUExecutor, ExecutorAsyncBase):
    """Async counterpart of ``SPMDGPUExecutor``; model execution is not implemented."""

    async def execute_model_async(self, execute_model_req: ExecuteModelRequest) -> List[SamplerOutput]:
        """Execute one model step on the given sequences.

        Raises:
            NotImplementedError: Always.
        """
        raise NotImplementedError

    async def check_health_async(self) -> None:
        """Check that the executor is healthy by delegating to the
        synchronous ``check_health``."""
        self.check_health()
|
||||
77
verl/third_party/vllm/vllm_v_0_4_2/tokenizer.py
vendored
Normal file
77
verl/third_party/vllm/vllm_v_0_4_2/tokenizer.py
vendored
Normal file
@@ -0,0 +1,77 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/tokenizer_group/tokenizer_group.py
|
||||
|
||||
from typing import List, Optional, Tuple, Union
|
||||
|
||||
from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast)
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.utils import make_async, LRUCache
|
||||
from vllm.transformers_utils.tokenizers import *
|
||||
|
||||
|
||||
class TokenizerGroup:
    """A group of tokenizers that can be used for LoRA adapters.

    In this implementation every LoRA request is served by the base
    tokenizer; the per-adapter cache only stores references to it.
    """

    def __init__(self, tokenizer: PreTrainedTokenizer, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int]):
        """Wrap an already constructed tokenizer.

        Args:
            tokenizer: The base tokenizer used for all requests.
            enable_lora: Whether a per-adapter tokenizer cache is created.
            max_num_seqs: Capacity of the per-adapter tokenizer cache.
            max_input_length: Value reported by ``get_max_input_len``.
        """
        self.enable_lora = enable_lora
        self.max_input_length = max_input_length
        self.tokenizer = tokenizer
        self.lora_tokenizers = LRUCache[PreTrainedTokenizer](capacity=max_num_seqs) if enable_lora else None

    def ping(self) -> bool:
        """Check if the tokenizer group is alive."""
        return True

    def get_max_input_len(self, lora_request: Optional[LoRARequest] = None) -> Optional[int]:
        """Get the maximum input length for the LoRA request.

        ``lora_request`` is ignored; the same limit applies to every request.
        """
        return self.max_input_length

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Tokenize ``prompt`` with the tokenizer selected for ``lora_request``.

        ``request_id`` is accepted but not used.
        """
        tokenizer = self.get_lora_tokenizer(lora_request)
        return tokenizer.encode(prompt)

    async def encode_async(self,
                           prompt: str,
                           request_id: Optional[str] = None,
                           lora_request: Optional[LoRARequest] = None) -> List[int]:
        """Async variant of ``encode``; ``request_id`` is accepted but not used."""
        tokenizer = await self.get_lora_tokenizer_async(lora_request)
        return tokenizer.encode(prompt)

    def get_lora_tokenizer(self, lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
        """Return the tokenizer to use for ``lora_request``.

        Without a request, or with LoRA disabled, this is the base tokenizer.
        Otherwise the tokenizer cached under the request's ``lora_int_id`` is
        returned, caching the base tokenizer on first use.
        """
        if not lora_request or not self.enable_lora:
            return self.tokenizer
        if lora_request.lora_int_id not in self.lora_tokenizers:
            # TODO(sgm): the lora tokenizer is also passed, but may be different
            tokenizer = self.tokenizer
            # tokenizer = (get_lora_tokenizer(
            #     lora_request, **self.tokenizer_config) or self.tokenizer)
            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
            return tokenizer
        else:
            return self.lora_tokenizers.get(lora_request.lora_int_id)

    async def get_lora_tokenizer_async(self, lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
        """Async variant of ``get_lora_tokenizer``.

        ``encode_async`` awaits this coroutine. The synchronous lookup only
        touches the in-memory cache, so it is called directly.
        """
        return self.get_lora_tokenizer(lora_request)

    # FIXME(sgm): for simplicity, we assign the special token here
    @property
    def pad_token_id(self):
        """The ``pad_token_id`` of the base tokenizer."""
        return self.tokenizer.pad_token_id

    @property
    def eos_token_id(self):
        """The ``eos_token_id`` of the base tokenizer."""
        return self.tokenizer.eos_token_id
|
||||
292
verl/third_party/vllm/vllm_v_0_4_2/worker.py
vendored
Normal file
292
verl/third_party/vllm/vllm_v_0_4_2/worker.py
vendored
Normal file
@@ -0,0 +1,292 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker.py
|
||||
"""A GPU worker class."""
|
||||
import os
|
||||
import gc
|
||||
from typing import Dict, List, Tuple, Optional, Union
|
||||
|
||||
import torch
|
||||
import torch.distributed
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.config import (CacheConfig, DeviceConfig, LoRAConfig, ParallelConfig, SchedulerConfig, VisionLanguageConfig)
|
||||
from vllm.model_executor import set_random_seed
|
||||
from vllm.sequence import SamplerOutput, ExecuteModelRequest
|
||||
from vllm.worker.cache_engine import CacheEngine
|
||||
from vllm.distributed.device_communicators import pynccl_utils
|
||||
from vllm.distributed.device_communicators.custom_all_reduce import (init_custom_ar)
|
||||
# TODO(sgm): check why vllm has similar file in vllm.model_executor.parallel_utils.parallel_state
|
||||
from vllm.distributed import get_tensor_model_parallel_cpu_group, init_distributed_environment, get_tensor_model_parallel_group
|
||||
from vllm.worker.worker import Worker, _check_if_gpu_supports_dtype
|
||||
|
||||
from .model_runner import ModelRunner
|
||||
from .megatron_weight_loaders import load_megatron_weights
|
||||
from .hf_weight_loader import load_hf_weights
|
||||
from .dtensor_weight_loaders import load_dtensor_weights
|
||||
from .parallel_state import (ensure_model_parallel_initialized)
|
||||
from .config import ModelConfig, LoadConfig, LoadFormat
|
||||
|
||||
|
||||
class Worker(Worker):
    """A worker class that executes (a partition of) the model on a GPU.

    Each worker is associated with a single GPU. The worker is responsible for
    maintaining the KV cache and executing the model on the GPU. In case of
    distributed inference, each worker is assigned a partition of the model.
    """

    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        cache_config: CacheConfig,
        load_config: LoadConfig,
        local_rank: int,
        rank: int,
        distributed_init_method: str,
        lora_config: Optional[LoRAConfig] = None,
        vision_language_config: Optional[VisionLanguageConfig] = None,
        is_driver_worker: bool = False,
    ) -> None:
        """Store the configuration and build the model runner.

        The device and the distributed environment are not touched here;
        that happens in ``init_device``.

        Args:
            model: The model itself or its parameter dict; handed to the
                ``ModelRunner``.
            model_config: Model configuration.
            parallel_config: Parallelism configuration.
            scheduler_config: Scheduler configuration.
            device_config: Device configuration.
            cache_config: KV-cache configuration.
            load_config: Weight-loading configuration.
            local_rank: Local rank of this worker.
            rank: Global rank of this worker.
            distributed_init_method: Init method for the distributed backend.
            lora_config: Optional LoRA configuration.
            vision_language_config: Optional vision-language configuration;
                must not be combined with ``lora_config``.
            is_driver_worker: Whether this is the driver worker; a driver
                worker must have rank 0.
        """
        # self.model = model  # will be replaced in the init_model
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.device_config = device_config
        self.cache_config = cache_config
        self.local_rank = local_rank
        self.rank = rank
        self.distributed_init_method = distributed_init_method
        self.lora_config = lora_config
        self.load_config = load_config
        self.is_driver_worker = is_driver_worker
        if self.is_driver_worker:
            assert self.rank == 0, "The driver worker must have rank 0."

        self.vision_language_config = vision_language_config
        if self.vision_language_config:
            assert not self.lora_config, ("To be tested: vision language model with LoRA settings.")

        self.model_runner = ModelRunner(
            model,
            model_config,
            parallel_config,
            scheduler_config,
            device_config,
            load_config=load_config,
            lora_config=self.lora_config,
            kv_cache_dtype=self.cache_config.cache_dtype,
            vision_language_config=vision_language_config,
        )

        # Uninitialized cache engine. Will be initialized by
        # init_cache_engine.
        self.cache_engine: CacheEngine = None
        self.gpu_cache: List[torch.Tensor] = None

        # NOTE(sgm): For offloading inference engine params
        self.cpu_model = None

    def init_device(self) -> None:
        """Select the CUDA device and initialize the distributed environment.

        ``LOCAL_RANK`` and ``WORLD_SIZE`` (and ``RANK`` when ``self.rank`` is
        ``None``) are read from the environment.

        Raises:
            RuntimeError: If the configured device type is not ``"cuda"``.
            ValueError: If no non-negative rank could be determined.
        """
        if self.device_config.device.type != "cuda":
            raise RuntimeError(f"Not support device type: {self.device_config.device}")

        # torch.distributed.all_reduce does not free the input tensor until
        # the synchronization point. This causes the memory usage to grow
        # as the number of all_reduce calls increases. This env var disables
        # this behavior.
        # Related issue:
        # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
        os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

        # NOTE(sgm): Modify for verl, Env vars will be set by TORCHRUN.
        self.rank = self.rank if self.rank is not None else int(os.getenv("RANK", "-1"))
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        self.device = torch.device(f"cuda:{local_rank}")
        if self.rank < 0:
            raise ValueError("Invalid or unspecified rank.")
        torch.cuda.set_device(self.device)

        # Use the world_size set by TORCHRUN
        world_size = int(os.getenv("WORLD_SIZE", "-1"))
        assert world_size != -1, "The world_size is set to -1, not initialized by TORCHRUN"
        self.parallel_config.world_size = world_size

        _check_if_gpu_supports_dtype(self.model_config.dtype)
        torch.cuda.empty_cache()
        self.init_gpu_memory = torch.cuda.mem_get_info()[0]

        # Initialize the distributed environment.
        init_worker_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method,
                                            self.local_rank)
        # Set random seed.
        set_random_seed(self.model_config.seed)
        # self.model = get_model(actor_model=self.model, model_config=self.model_config)

    @torch.inference_mode()
    def determine_num_available_blocks(self) -> Tuple[int, int]:
        """Profiles the peak memory usage of the model to determine how many
        KV blocks may be allocated without OOMs.

        The engine will first conduct a profiling of the existing memory usage.
        Then, it calculates the maximum possible number of GPU and CPU blocks
        that can be allocated with the remaining free memory. Both counts are
        finally reduced with MIN over the tensor-model-parallel group so that
        all of its ranks agree.

        .. tip::
            You may limit the usage of GPU memory
            by adjusting the `gpu_memory_utilization` parameter.

        Returns:
            - tuple[num_gpu_blocks, num_cpu_blocks]
        """
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.
        torch.cuda.empty_cache()
        # torch.cuda.reset_peak_memory_stats()

        # Execute a forward pass with dummy inputs to profile the memory usage
        # of the model.
        self.model_runner.profile_run()

        # Calculate the number of blocks that can be allocated with the
        # profiled peak memory.
        torch.cuda.synchronize()
        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
        peak_memory = total_gpu_memory - free_gpu_memory

        assert peak_memory > 0, ("Error in memory profiling. This happens when the GPU memory was "
                                 "not properly cleaned up before initializing the vLLM instance.")

        cache_block_size = self.get_cache_block_size_bytes()

        # NOTE(sgm) use the remaining memory: the utilization factor is applied
        # to the memory that is still free, not to the total GPU memory.
        num_gpu_blocks = int((free_gpu_memory * self.cache_config.gpu_memory_utilization) // cache_block_size)
        # num_gpu_blocks = int((total_gpu_memory * self.cache_config.gpu_memory_utilization - peak_memory) // cache_block_size)

        num_cpu_blocks = int(self.cache_config.swap_space_bytes // cache_block_size)
        num_gpu_blocks = max(num_gpu_blocks, 0)
        num_cpu_blocks = max(num_cpu_blocks, 0)
        if self.model_runner.lora_manager:
            self.model_runner.remove_all_loras()

        # NOTE(sgm): Add for verl, synchronize number of blocks with all the rank
        num_gpu_blocks = torch.tensor([num_gpu_blocks], device='cuda')
        num_cpu_blocks = torch.tensor([num_cpu_blocks], device='cuda')
        torch.distributed.all_reduce(num_gpu_blocks,
                                     op=torch.distributed.ReduceOp.MIN,
                                     group=get_tensor_model_parallel_group())
        torch.distributed.all_reduce(num_cpu_blocks,
                                     op=torch.distributed.ReduceOp.MIN,
                                     group=get_tensor_model_parallel_group())
        num_gpu_blocks = num_gpu_blocks.item()
        num_cpu_blocks = num_cpu_blocks.item()
        gc.collect()
        torch.cuda.empty_cache()
        return num_gpu_blocks, num_cpu_blocks

    def _init_cache_engine(self):
        """Build the cache engine via the parent class, unless one already exists."""
        if self.cache_engine is None and self.gpu_cache is None:
            super()._init_cache_engine()

    def free_cache_engine(self):
        """Drop the references to the cache engine and the GPU cache."""
        # ensure `enforce_eager=True`
        self.cache_engine = None
        self.gpu_cache = None

    @torch.inference_mode()
    def execute_model(self, execute_model_req: Optional[ExecuteModelRequest] = None) -> List[SamplerOutput]:
        """Execute one model step for the given request.

        Args:
            execute_model_req: The request to execute. ``None`` is accepted by
                the signature but rejected with an ``AssertionError``.

        Returns:
            An empty list when the request has no sequence groups, otherwise a
            one-element list holding the model runner's output.
        """
        # NOTE(sgm): each SPMD rank will have identical input
        assert execute_model_req is not None
        seq_group_metadata_list = execute_model_req.seq_group_metadata_list
        assert seq_group_metadata_list is not None

        num_seq_groups = len(seq_group_metadata_list)
        blocks_to_swap_in = execute_model_req.blocks_to_swap_in
        blocks_to_swap_out = execute_model_req.blocks_to_swap_out
        blocks_to_copy = execute_model_req.blocks_to_copy

        self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)

        # If there is no input, we don't need to execute the model.
        if num_seq_groups == 0:
            return []

        output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache)

        # Worker only supports single-step execution. Wrap the output in a list
        # to conform to interface.
        return [output]

    # assume the input is .state_dict()
    def sync_model_weights(self, actor_weights: Dict, load_format: str):
        """Load ``actor_weights`` into the model using the loader selected by ``load_format``.

        MEGATRON and AUTO use the Megatron loader, HF the Hugging Face loader
        and DTENSOR the DTensor loader.
        """
        # NOTE(review): any other load_format falls through and is silently
        # ignored -- confirm whether that is intended.
        if load_format in [LoadFormat.MEGATRON, LoadFormat.AUTO]:
            load_megatron_weights(actor_weights, self.model_runner.model)
        elif load_format == LoadFormat.HF:
            # full model state dict without sharding
            load_hf_weights(actor_weights, self.model_runner.model)
        elif load_format == LoadFormat.DTENSOR:
            load_dtensor_weights(actor_weights, self.model_runner.model)

    def offload_model_weights(self) -> None:
        """Re-point every model parameter at a CPU placeholder tensor.

        The placeholders are created with ``torch.empty_like`` on the first
        call and reused afterwards; the current parameter values are not
        copied into them.
        """
        if self.cpu_model is None:
            self.cpu_model = {
                name: torch.empty_like(params, device='cpu')
                for name, params in self.model_runner.model.named_parameters()
            }
        for name, params in self.model_runner.model.named_parameters():
            params.data = self.cpu_model[name]
|
||||
|
||||
|
||||
def init_worker_distributed_environment(
    parallel_config: ParallelConfig,
    rank: int,
    distributed_init_method: Optional[str] = "env://",
    local_rank: int = -1,
) -> None:
    """Initialize the distributed environment.

    Sets up the distributed environment for ``parallel_config.world_size``
    processes, makes sure the tensor/pipeline model-parallel groups exist and
    finishes with a small warm-up all-reduce.

    Args:
        parallel_config: Provides the world size and the tensor/pipeline
            parallel sizes.
        rank: Global rank of the calling process.
        distributed_init_method: Init method for the distributed backend.
        local_rank: Local rank of the calling process.
    """
    # NOTE(sgm) use tcp://localhost:xxxx will hang in HF setting without megatron
    world_size = parallel_config.world_size
    init_distributed_environment(world_size, rank, distributed_init_method, local_rank)

    tp_size = parallel_config.tensor_parallel_size
    pp_size = parallel_config.pipeline_parallel_size
    ensure_model_parallel_initialized(tensor_model_parallel_size=tp_size, pipeline_model_parallel_size=pp_size)

    # TODO(sgm): check whether the pynccl process-group initialization (with its
    # world-size consistency check) and the custom fast all-reduce
    # (init_custom_ar) are needed here; both are currently disabled, as is the
    # matching pynccl warm-up all-reduce.

    # A small all_reduce for warmup.
    warmup_tensor = torch.zeros(1).cuda()
    torch.distributed.all_reduce(warmup_tensor)
|
||||
Reference in New Issue
Block a user