Initial commit
13 verl/third_party/vllm/vllm_v_0_3_1/__init__.py (vendored, Normal file)
@@ -0,0 +1,13 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
228 verl/third_party/vllm/vllm_v_0_3_1/arg_utils.py (vendored, Normal file)
@@ -0,0 +1,228 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/arg_utils.py

import argparse
import dataclasses
from dataclasses import dataclass
from typing import Dict, Optional, Tuple

import torch.nn as nn
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
from transformers import PretrainedConfig
from .config import ModelConfig


@dataclass
class EngineArgs:
    """Arguments for vLLM engine."""
    model_hf_config: PretrainedConfig = None
    dtype: str = 'auto'
    kv_cache_dtype: str = 'auto'
    seed: int = 0
    max_model_len: Optional[int] = None
    worker_use_ray: bool = False
    pipeline_parallel_size: int = 1
    tensor_parallel_size: int = 1
    max_parallel_loading_workers: Optional[int] = None
    block_size: int = 16
    swap_space: int = 4  # GiB
    gpu_memory_utilization: float = 0.90
    max_num_batched_tokens: Optional[int] = None
    max_num_seqs: int = 256
    max_paddings: int = 256
    disable_log_stats: bool = False
    revision: Optional[str] = None
    tokenizer_revision: Optional[str] = None
    quantization: Optional[str] = None
    load_format: str = 'model'
    enforce_eager: bool = False
    max_context_len_to_capture: int = 8192
    disable_custom_all_reduce: bool = False
    enable_lora: bool = False
    max_loras: int = 1
    max_lora_rank: int = 16
    lora_extra_vocab_size: int = 256
    lora_dtype = 'auto'
    max_cpu_loras: Optional[int] = None
    device: str = 'cuda'

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        """Shared CLI arguments for vLLM engine."""
        # Model arguments
        # TODO(shengguangming): delete the unused args
        parser.add_argument('--model',
                            type=str,
                            default='facebook/opt-125m',
                            help='name or path of the huggingface model to use')
        parser.add_argument('--tokenizer',
                            type=str,
                            default=EngineArgs.tokenizer,
                            help='name or path of the huggingface tokenizer to use')
        parser.add_argument('--revision',
                            type=str,
                            default=None,
                            help='the specific model version to use. It can be a branch '
                            'name, a tag name, or a commit id. If unspecified, will use '
                            'the default version.')
        parser.add_argument('--tokenizer-revision',
                            type=str,
                            default=None,
                            help='the specific tokenizer version to use. It can be a branch '
                            'name, a tag name, or a commit id. If unspecified, will use '
                            'the default version.')
        parser.add_argument('--tokenizer-mode',
                            type=str,
                            default=EngineArgs.tokenizer_mode,
                            choices=['auto', 'slow'],
                            help='tokenizer mode. "auto" will use the fast '
                            'tokenizer if available, and "slow" will '
                            'always use the slow tokenizer.')
        parser.add_argument('--trust-remote-code', action='store_true', help='trust remote code from huggingface')
        parser.add_argument('--download-dir',
                            type=str,
                            default=EngineArgs.download_dir,
                            help='directory to download and load the weights, '
                            'default to the default cache dir of '
                            'huggingface')
        parser.add_argument('--load-format',
                            type=str,
                            default=EngineArgs.load_format,
                            choices=['auto', 'pt', 'safetensors', 'npcache', 'dummy'],
                            help='The format of the model weights to load. '
                            '"auto" will try to load the weights in the safetensors format '
                            'and fall back to the pytorch bin format if safetensors format '
                            'is not available. '
                            '"pt" will load the weights in the pytorch bin format. '
                            '"safetensors" will load the weights in the safetensors format. '
                            '"npcache" will load the weights in pytorch format and store '
                            'a numpy cache to speed up the loading. '
                            '"dummy" will initialize the weights with random values, '
                            'which is mainly for profiling.')
        parser.add_argument('--dtype',
                            type=str,
                            default=EngineArgs.dtype,
                            choices=['auto', 'half', 'float16', 'bfloat16', 'float', 'float32'],
                            help='data type for model weights and activations. '
                            'The "auto" option will use FP16 precision '
                            'for FP32 and FP16 models, and BF16 precision '
                            'for BF16 models.')
        parser.add_argument('--max-model-len',
                            type=int,
                            default=None,
                            help='model context length. If unspecified, '
                            'will be automatically derived from the model.')
        # Parallel arguments
        parser.add_argument('--worker-use-ray',
                            action='store_true',
                            help='use Ray for distributed serving, will be '
                            'automatically set when using more than 1 GPU')
        parser.add_argument('--pipeline-parallel-size',
                            '-pp',
                            type=int,
                            default=EngineArgs.pipeline_parallel_size,
                            help='number of pipeline stages')
        parser.add_argument('--tensor-parallel-size',
                            '-tp',
                            type=int,
                            default=EngineArgs.tensor_parallel_size,
                            help='number of tensor parallel replicas')
        # KV cache arguments
        parser.add_argument('--block-size',
                            type=int,
                            default=EngineArgs.block_size,
                            choices=[8, 16, 32],
                            help='token block size')
        # TODO(woosuk): Support fine-grained seeds (e.g., seed per request).
        parser.add_argument('--seed', type=int, default=EngineArgs.seed, help='random seed')
        parser.add_argument('--swap-space',
                            type=int,
                            default=EngineArgs.swap_space,
                            help='CPU swap space size (GiB) per GPU')
        parser.add_argument('--gpu-memory-utilization',
                            type=float,
                            default=EngineArgs.gpu_memory_utilization,
                            help='the percentage of GPU memory to be used for '
                            'the model executor')
        parser.add_argument('--max-num-batched-tokens',
                            type=int,
                            default=EngineArgs.max_num_batched_tokens,
                            help='maximum number of batched tokens per '
                            'iteration')
        parser.add_argument('--max-num-seqs',
                            type=int,
                            default=EngineArgs.max_num_seqs,
                            help='maximum number of sequences per iteration')
        parser.add_argument('--disable-log-stats', action='store_true', help='disable logging statistics')
        # Quantization settings.
        parser.add_argument('--quantization',
                            '-q',
                            type=str,
                            choices=['awq', None],
                            default=None,
                            help='Method used to quantize the weights')
        return parser

    @classmethod
    def from_cli_args(cls, args: argparse.Namespace) -> 'EngineArgs':
        # Get the list of attributes of this dataclass.
        attrs = [attr.name for attr in dataclasses.fields(cls)]
        # Set the attributes from the parsed arguments.
        engine_args = cls(**{attr: getattr(args, attr) for attr in attrs})
        return engine_args

    def create_engine_configs(
        self,
    ) -> Tuple[ModelConfig, CacheConfig, ParallelConfig, SchedulerConfig, DeviceConfig, Optional[LoRAConfig]]:
        device_config = DeviceConfig(self.device)
        model_config = ModelConfig(self.model_hf_config, self.dtype, self.seed, self.load_format, self.revision,
                                   self.tokenizer_revision, self.max_model_len, self.quantization, self.enforce_eager,
                                   self.max_context_len_to_capture)
        cache_config = CacheConfig(self.block_size, self.gpu_memory_utilization, self.swap_space, self.kv_cache_dtype,
                                   model_config.get_sliding_window())
        parallel_config = ParallelConfig(self.pipeline_parallel_size, self.tensor_parallel_size, self.worker_use_ray,
                                         self.max_parallel_loading_workers, self.disable_custom_all_reduce)
        scheduler_config = SchedulerConfig(self.max_num_batched_tokens, self.max_num_seqs, model_config.max_model_len,
                                           self.max_paddings)
        lora_config = LoRAConfig(max_lora_rank=self.max_lora_rank,
                                 max_loras=self.max_loras,
                                 lora_extra_vocab_size=self.lora_extra_vocab_size,
                                 lora_dtype=self.lora_dtype,
                                 max_cpu_loras=self.max_cpu_loras if self.max_cpu_loras and self.max_cpu_loras > 0 else
                                 None) if self.enable_lora else None
        return (model_config, cache_config, parallel_config, scheduler_config, device_config, lora_config)
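
# Usage sketch (illustrative, not part of the vendored file): verl builds the
# engine from an already-loaded HF config rather than a model name, so no
# checkpoint download happens when deriving the vLLM configs.
#
#     from transformers import AutoConfig
#     hf_config = AutoConfig.from_pretrained("facebook/opt-125m")
#     args = EngineArgs(model_hf_config=hf_config, tensor_parallel_size=2)
#     (model_cfg, cache_cfg, parallel_cfg, scheduler_cfg,
#      device_cfg, lora_cfg) = args.create_engine_configs()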

@dataclass
class AsyncEngineArgs(EngineArgs):
    """Arguments for asynchronous vLLM engine."""
    engine_use_ray: bool = False
    disable_log_requests: bool = False
    max_log_len: Optional[int] = None

    @staticmethod
    def add_cli_args(parser: argparse.ArgumentParser) -> argparse.ArgumentParser:
        parser = EngineArgs.add_cli_args(parser)
        parser.add_argument('--engine-use-ray',
                            action='store_true',
                            help='use Ray to start the LLM engine in a '
                            'separate process as the server process.')
        parser.add_argument('--disable-log-requests', action='store_true', help='disable logging requests')
        parser.add_argument('--max-log-len',
                            type=int,
                            default=None,
                            help='max number of prompt characters or prompt '
                            'ID numbers being printed in log. '
                            'Default: unlimited.')
        return parser
577 verl/third_party/vllm/vllm_v_0_3_1/config.py (vendored, Normal file)
@@ -0,0 +1,577 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/config.py

from typing import Optional, Union, ClassVar
from dataclasses import dataclass
import torch
from transformers import PretrainedConfig
from packaging.version import Version

from vllm.logger import init_logger
from vllm.transformers_utils.config import get_config
from vllm.utils import get_cpu_memory, is_hip, get_nvcc_cuda_version

logger = init_logger(__name__)

_GB = 1 << 30


class ModelConfig:
    """Configuration for the model.

    Args:
        model: Name or path of the huggingface model to use.
        tokenizer: Name or path of the huggingface tokenizer to use.
        tokenizer_mode: Tokenizer mode. "auto" will use the fast tokenizer if
            available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        download_dir: Directory to download and load the weights, default to the
            default cache directory of huggingface.
        load_format: The format of the model weights to load:
            "auto" will try to load the weights in the safetensors format and
            fall back to the pytorch bin format if safetensors format is
            not available.
            "pt" will load the weights in the pytorch bin format.
            "safetensors" will load the weights in the safetensors format.
            "npcache" will load the weights in pytorch format and store
            a numpy cache to speed up the loading.
            "dummy" will initialize the weights with random values, which is
            mainly for profiling.
        dtype: Data type for model weights and activations. The "auto" option
            will use FP16 precision for FP32 and FP16 models, and BF16 precision
            for BF16 models.
        seed: Random seed for reproducibility.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id. If unspecified, will use the default
            version.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id. If unspecified, will use
            the default version.
        max_model_len: Maximum length of a sequence (including prompt and
            output). If None, will be derived from the model.
        quantization: Quantization method that was used to quantize the model
            weights. If None, we assume the model weights are not quantized.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode.
    """

    def __init__(
        self,
        hf_config: PretrainedConfig,
        dtype: str,
        seed: int,
        load_format: str = 'model',
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        max_model_len: Optional[int] = None,
        quantization: Optional[str] = None,
        trust_remote_code: Optional[bool] = True,
        enforce_eager: bool = False,
        max_context_len_to_capture: Optional[int] = None,
    ) -> None:
        self.model = hf_config._name_or_path
        self.tokenizer = hf_config._name_or_path
        self.load_format = load_format
        self.seed = seed
        self.revision = revision
        self.tokenizer_revision = tokenizer_revision
        self.quantization = quantization
        self.trust_remote_code = trust_remote_code
        self.enforce_eager = enforce_eager
        self.max_context_len_to_capture = max_context_len_to_capture

        # self.hf_config = get_config(model, trust_remote_code, revision)
        self.hf_config = hf_config
        self.dtype = _get_and_verify_dtype(self.hf_config, dtype)
        self.max_model_len = _get_and_verify_max_len(self.hf_config, max_model_len)
        # self._verify_load_format()
        # self._verify_tokenizer_mode()
        self._verify_quantization()
        self._verify_cuda_graph()

    def _verify_load_format(self) -> None:
        load_format = self.load_format.lower()
        if load_format not in ["auto", "pt", "safetensors", "npcache", "dummy", "model"]:
            raise ValueError(f"Unknown load format: {self.load_format}. Must be one of "
                             "'auto', 'pt', 'safetensors', 'npcache', 'dummy' or 'model'.")
        self.load_format = load_format

    # def _verify_tokenizer_mode(self) -> None:
    #     tokenizer_mode = self.tokenizer_mode.lower()
    #     if tokenizer_mode not in ["auto", "slow"]:
    #         raise ValueError(
    #             f"Unknown tokenizer mode: {self.tokenizer_mode}. Must be "
    #             "either 'auto' or 'slow'.")
    #     self.tokenizer_mode = tokenizer_mode

    def _verify_quantization(self) -> None:
        supported_quantization = ["awq", "gptq", "squeezellm"]
        rocm_not_supported_quantization = ["awq", "gptq"]
        if self.quantization is not None:
            self.quantization = self.quantization.lower()

        # Parse quantization method from the HF model config, if available.
        hf_quant_config = getattr(self.hf_config, "quantization_config", None)
        if hf_quant_config is not None:
            hf_quant_method = str(hf_quant_config["quant_method"]).lower()
            if self.quantization is None:
                self.quantization = hf_quant_method
            elif self.quantization != hf_quant_method:
                raise ValueError("Quantization method specified in the model config "
                                 f"({hf_quant_method}) does not match the quantization "
                                 f"method specified in the `quantization` argument "
                                 f"({self.quantization}).")

        if self.quantization is not None:
            if self.quantization not in supported_quantization:
                raise ValueError(f"Unknown quantization method: {self.quantization}. Must "
                                 f"be one of {supported_quantization}.")
            if is_hip() and self.quantization in rocm_not_supported_quantization:
                raise ValueError(f"{self.quantization} quantization is currently not supported "
                                 f"in ROCm.")
            logger.warning(f"{self.quantization} quantization is not fully "
                           "optimized yet. The speed can be slower than "
                           "non-quantized models.")

    def _verify_cuda_graph(self) -> None:
        if self.max_context_len_to_capture is None:
            self.max_context_len_to_capture = self.max_model_len
        self.max_context_len_to_capture = min(self.max_context_len_to_capture, self.max_model_len)
        if (self.quantization in ["gptq", "squeezellm"] and not self.enforce_eager):
            # Related issue: https://github.com/vllm-project/vllm/issues/2147
            logger.warning(f"{self.quantization} does not support CUDA graph "
                           "yet. Disabling CUDA graph.")
            self.enforce_eager = True

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        total_num_attention_heads = self.hf_config.num_attention_heads
        tensor_parallel_size = parallel_config.tensor_parallel_size
        if total_num_attention_heads % tensor_parallel_size != 0:
            raise ValueError(f"Total number of attention heads ({total_num_attention_heads})"
                             " must be divisible by tensor parallel size "
                             f"({tensor_parallel_size}).")

        total_num_hidden_layers = self.hf_config.num_hidden_layers
        pipeline_parallel_size = parallel_config.pipeline_parallel_size
        if total_num_hidden_layers % pipeline_parallel_size != 0:
            raise ValueError(f"Total number of hidden layers ({total_num_hidden_layers}) "
                             "must be divisible by pipeline parallel size "
                             f"({pipeline_parallel_size}).")

    def get_sliding_window(self) -> Optional[int]:
        return getattr(self.hf_config, "sliding_window", None)

    def get_vocab_size(self) -> int:
        return self.hf_config.vocab_size

    def get_hidden_size(self) -> int:
        return self.hf_config.hidden_size

    def get_head_size(self) -> int:
        # FIXME(woosuk): This may not be true for all models.
        return self.hf_config.hidden_size // self.hf_config.num_attention_heads

    def get_total_num_kv_heads(self) -> int:
        """Returns the total number of KV heads."""
        # For GPTBigCode & Falcon:
        # NOTE: for falcon, when new_decoder_architecture is True, the
        # multi_query flag is ignored and we use n_head_kv for the number of
        # KV heads.
        falcon_model_types = ["falcon", "RefinedWeb", "RefinedWebModel"]
        new_decoder_arch_falcon = (self.hf_config.model_type in falcon_model_types and
                                   getattr(self.hf_config, "new_decoder_architecture", False))
        if not new_decoder_arch_falcon and getattr(self.hf_config, "multi_query", False):
            # Multi-query attention, only one KV head.
            # Currently, tensor parallelism is not supported in this case.
            return 1

        attributes = [
            # For Falcon:
            "n_head_kv",
            "num_kv_heads",
            # For LLaMA-2:
            "num_key_value_heads",
            # For ChatGLM:
            "multi_query_group_num",
        ]
        for attr in attributes:
            num_kv_heads = getattr(self.hf_config, attr, None)
            if num_kv_heads is not None:
                return num_kv_heads

        # For non-grouped-query attention models, the number of KV heads is
        # equal to the number of attention heads.
        return self.hf_config.num_attention_heads

    def get_num_kv_heads(self, parallel_config: "ParallelConfig") -> int:
        """Returns the number of KV heads per GPU."""
        total_num_kv_heads = self.get_total_num_kv_heads()
        # If tensor parallelism is used, we divide the number of KV heads by
        # the tensor parallel size. We will replicate the KV heads in the
        # case where the number of KV heads is smaller than the tensor
        # parallel size so each GPU has at least one KV head.
        return max(1, total_num_kv_heads // parallel_config.tensor_parallel_size)

    def get_num_layers(self, parallel_config: "ParallelConfig") -> int:
        total_num_hidden_layers = self.hf_config.num_hidden_layers
        return total_num_hidden_layers // parallel_config.pipeline_parallel_size
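
# Worked example of the KV-head logic above (illustrative, not vendored code):
# a LLaMA-2-70B style config has num_attention_heads=64 and
# num_key_value_heads=8, so get_total_num_kv_heads() returns 8. With
# tensor_parallel_size=8 each GPU then holds max(1, 8 // 8) = 1 KV head; with
# tensor_parallel_size=16 the heads are replicated so that every GPU still
# gets max(1, 8 // 16) = 1.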

class CacheConfig:
    """Configuration for the KV cache.

    Args:
        block_size: Size of a cache block in number of tokens.
        gpu_memory_utilization: Fraction of GPU memory to use for the
            vLLM execution.
        swap_space: Size of the CPU swap space per GPU (in GiB).
        cache_dtype: Data type for kv cache storage.
    """

    def __init__(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        swap_space: int,
        cache_dtype: str,
        sliding_window: Optional[int] = None,
    ) -> None:
        self.block_size = block_size
        self.gpu_memory_utilization = gpu_memory_utilization
        self.swap_space_bytes = swap_space * _GB
        self.cache_dtype = cache_dtype
        self.sliding_window = sliding_window
        self._verify_args()
        self._verify_cache_dtype()

        # Will be set after profiling.
        self.num_gpu_blocks = None
        self.num_cpu_blocks = None

    def _verify_args(self) -> None:
        if self.gpu_memory_utilization > 1.0:
            raise ValueError("GPU memory utilization must be less than 1.0. Got "
                             f"{self.gpu_memory_utilization}.")

    def _verify_cache_dtype(self) -> None:
        if self.cache_dtype == "auto":
            pass
        elif self.cache_dtype == "fp8_e5m2":
            nvcc_cuda_version = get_nvcc_cuda_version()
            if nvcc_cuda_version < Version("11.8"):
                raise ValueError("FP8 is not supported when cuda version is lower than 11.8.")
            device_name = torch.cuda.get_device_name()
            if "AMD" in device_name:
                raise NotImplementedError("FP8_E5M2 KV Cache on AMD GPU has not been supported yet.")
            logger.info("Using fp8_e5m2 data type to store kv cache. It reduces "
                        "the GPU memory footprint and boosts the performance. "
                        "But it may cause slight accuracy drop. "
                        "Currently we only support fp8 without scaling factors and "
                        "make e5m2 as a default format.")
        else:
            raise ValueError(f"Unknown kv cache dtype: {self.cache_dtype}")

    def verify_with_parallel_config(
        self,
        parallel_config: "ParallelConfig",
    ) -> None:
        total_cpu_memory = get_cpu_memory()
        # FIXME(woosuk): Here, it is assumed that the GPUs in a tensor parallel
        # group are in the same node. However, the GPUs may span multiple nodes.
        num_gpus_per_node = parallel_config.tensor_parallel_size
        cpu_memory_usage = self.swap_space_bytes * num_gpus_per_node

        msg = (f"{cpu_memory_usage / _GB:.2f} GiB out of "
               f"the {total_cpu_memory / _GB:.2f} GiB total CPU memory is "
               "allocated for the swap space.")
        if cpu_memory_usage > 0.7 * total_cpu_memory:
            raise ValueError("Too large swap space. " + msg)
        elif cpu_memory_usage > 0.4 * total_cpu_memory:
            logger.warning("Possibly too large swap space. " + msg)
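
# Worked example of the swap-space check above (illustrative): with the
# default swap_space=4 GiB and tensor_parallel_size=8, cpu_memory_usage is
# 32 GiB. On a 64 GiB host that is 50% of CPU memory, so the engine logs the
# "possibly too large swap space" warning (above the 40% threshold) but does
# not raise, since 32 GiB is still below the 70% hard limit of 44.8 GiB.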

class ParallelConfig:
    """Configuration for the distributed execution.

    Args:
        pipeline_parallel_size: Number of pipeline parallel groups.
        tensor_parallel_size: Number of tensor parallel groups.
        worker_use_ray: Whether to use Ray for model workers. Will be set to
            True if either pipeline_parallel_size or tensor_parallel_size is
            greater than 1.
        max_parallel_loading_workers: Maximum number of parallel batches when
            loading the model sequentially, to avoid RAM OOM when using tensor
            parallelism with large models.
        disable_custom_all_reduce: Disable the custom all-reduce kernel and
            fall back to NCCL.
    """

    def __init__(
        self,
        pipeline_parallel_size: int,
        tensor_parallel_size: int,
        worker_use_ray: bool,
        max_parallel_loading_workers: Optional[int] = None,
        disable_custom_all_reduce: bool = False,
    ) -> None:
        self.pipeline_parallel_size = pipeline_parallel_size
        self.tensor_parallel_size = tensor_parallel_size
        self.worker_use_ray = worker_use_ray
        self.max_parallel_loading_workers = max_parallel_loading_workers
        self.disable_custom_all_reduce = disable_custom_all_reduce

        self.world_size = pipeline_parallel_size * tensor_parallel_size
        if self.world_size > 1:
            self.worker_use_ray = True
        self._verify_args()

    def _verify_args(self) -> None:
        if self.pipeline_parallel_size > 1:
            raise NotImplementedError("Pipeline parallelism is not supported yet.")
        if not self.disable_custom_all_reduce and self.world_size > 1:
            if is_hip():
                self.disable_custom_all_reduce = True
                logger.info("Disabled the custom all-reduce kernel because it is not "
                            "supported on AMD GPUs.")
            elif self.pipeline_parallel_size > 1:
                self.disable_custom_all_reduce = True
                logger.info("Disabled the custom all-reduce kernel because it is not "
                            "supported with pipeline parallelism.")

        # FIXME(woosuk): Fix the stability issues and re-enable the custom
        # all-reduce kernel.
        if not self.disable_custom_all_reduce and self.world_size > 1:
            self.disable_custom_all_reduce = True
            logger.info("Custom all-reduce kernels are temporarily disabled due to "
                        "stability issues. We will re-enable them once the issues are "
                        "resolved.")


class SchedulerConfig:
    """Scheduler configuration.

    Args:
        max_num_batched_tokens: Maximum number of tokens to be processed in
            a single iteration.
        max_num_seqs: Maximum number of sequences to be processed in a single
            iteration.
        max_model_len: Maximum length of a sequence (including prompt
            and generated text).
        max_paddings: Maximum number of paddings to be added to a batch.
    """

    def __init__(
        self,
        max_num_batched_tokens: Optional[int],
        max_num_seqs: int,
        max_model_len: int,
        max_paddings: int,
    ) -> None:
        if max_num_batched_tokens is not None:
            self.max_num_batched_tokens = max_num_batched_tokens
        else:
            # If max_model_len is too short, use 2048 as the default value for
            # higher throughput.
            self.max_num_batched_tokens = max(max_model_len, 2048)
        self.max_num_seqs = max_num_seqs
        self.max_model_len = max_model_len
        self.max_paddings = max_paddings
        self._verify_args()

    def _verify_args(self) -> None:
        if self.max_num_batched_tokens < self.max_model_len:
            raise ValueError(f"max_num_batched_tokens ({self.max_num_batched_tokens}) is "
                             f"smaller than max_model_len ({self.max_model_len}). "
                             "This effectively limits the maximum sequence length to "
                             "max_num_batched_tokens and makes vLLM reject longer "
                             "sequences. Please increase max_num_batched_tokens or "
                             "decrease max_model_len.")
        if self.max_num_batched_tokens < self.max_num_seqs:
            raise ValueError(f"max_num_batched_tokens ({self.max_num_batched_tokens}) must "
                             "be greater than or equal to max_num_seqs "
                             f"({self.max_num_seqs}).")

class DeviceConfig:

    def __init__(self, device: str = "cuda") -> None:
        self.device = torch.device(device)


@dataclass
class LoRAConfig:
    max_lora_rank: int
    max_loras: int
    max_cpu_loras: Optional[int] = None
    lora_dtype: Optional[torch.dtype] = None
    lora_extra_vocab_size: int = 256
    # This is a constant.
    lora_vocab_padding_size: ClassVar[int] = 256

    def __post_init__(self):
        # Keep this in sync with csrc/punica/bgmv/bgmv_config.h
        possible_max_ranks = (8, 16, 32, 64)
        possible_lora_extra_vocab_size = (0, 256, 512)
        if self.max_lora_rank not in possible_max_ranks:
            raise ValueError(f"max_lora_rank ({self.max_lora_rank}) must be one of "
                             f"{possible_max_ranks}.")
        if self.lora_extra_vocab_size not in possible_lora_extra_vocab_size:
            raise ValueError(f"lora_extra_vocab_size ({self.lora_extra_vocab_size}) "
                             f"must be one of {possible_lora_extra_vocab_size}.")
        if self.max_loras < 1:
            raise ValueError(f"max_loras ({self.max_loras}) must be >= 1.")
        if self.max_cpu_loras is None:
            self.max_cpu_loras = self.max_loras
        elif self.max_cpu_loras < self.max_loras:
            raise ValueError(f"max_cpu_loras ({self.max_cpu_loras}) must be >= "
                             f"max_loras ({self.max_loras})")

    def verify_with_model_config(self, model_config: ModelConfig):
        if self.lora_dtype in (None, "auto"):
            self.lora_dtype = model_config.dtype
        elif isinstance(self.lora_dtype, str):
            self.lora_dtype = getattr(torch, self.lora_dtype)
        if model_config.quantization is not None:
            raise ValueError("LoRA is not supported with quantized models yet.")

    def verify_with_scheduler_config(self, scheduler_config: SchedulerConfig):
        if scheduler_config.max_num_batched_tokens > 65528:
            raise ValueError("Due to limitations of the custom LoRA CUDA kernel, "
                             "max_num_batched_tokens must be <= 65528 when "
                             "LoRA is enabled.")
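
# Example of the __post_init__ validation above (illustrative, not vendored):
#
#     lora_cfg = LoRAConfig(max_lora_rank=16, max_loras=2)
#     assert lora_cfg.max_cpu_loras == 2      # defaults to max_loras
#     LoRAConfig(max_lora_rank=12, max_loras=1)  # raises ValueError: rank must
#                                                # be one of (8, 16, 32, 64)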

_STR_DTYPE_TO_TORCH_DTYPE = {
    "half": torch.float16,
    "float16": torch.float16,
    "float": torch.float32,
    "float32": torch.float32,
    "bfloat16": torch.bfloat16,
}

_ROCM_NOT_SUPPORTED_DTYPE = ["float", "float32"]


def _get_and_verify_dtype(
    config: PretrainedConfig,
    dtype: Union[str, torch.dtype],
) -> torch.dtype:
    # NOTE: getattr(config, "torch_dtype", torch.float32) is not correct
    # because config.torch_dtype can be None.
    config_dtype = getattr(config, "torch_dtype", None)
    if config_dtype is None:
        config_dtype = torch.float32

    if isinstance(dtype, str):
        dtype = dtype.lower()
        if dtype == "auto":
            if config_dtype == torch.float32:
                # Following the common practice, we use float16 for float32
                # models.
                torch_dtype = torch.float16
            else:
                torch_dtype = config_dtype
        else:
            if dtype not in _STR_DTYPE_TO_TORCH_DTYPE:
                raise ValueError(f"Unknown dtype: {dtype}")
            torch_dtype = _STR_DTYPE_TO_TORCH_DTYPE[dtype]
    elif isinstance(dtype, torch.dtype):
        torch_dtype = dtype
    else:
        raise ValueError(f"Unknown dtype: {dtype}")

    if is_hip() and torch_dtype == torch.float32:
        rocm_supported_dtypes = [
            k for k, v in _STR_DTYPE_TO_TORCH_DTYPE.items() if (k not in _ROCM_NOT_SUPPORTED_DTYPE)
        ]
        raise ValueError(f"dtype '{dtype}' is not supported in ROCm. "
                         f"Supported dtypes are {rocm_supported_dtypes}")

    # Verify the dtype.
    if torch_dtype != config_dtype:
        if torch_dtype == torch.float32:
            # Upcasting to float32 is allowed.
            pass
        elif config_dtype == torch.float32:
            # Downcasting from float32 to float16 or bfloat16 is allowed.
            pass
        else:
            # Casting between float16 and bfloat16 is allowed with a warning.
            logger.warning(f"Casting {config_dtype} to {torch_dtype}.")

    return torch_dtype


def _get_and_verify_max_len(
    hf_config: PretrainedConfig,
    max_model_len: Optional[int],
) -> int:
    """Get and verify the model's maximum length."""
    derived_max_model_len = float("inf")
    possible_keys = [
        # OPT
        "max_position_embeddings",
        # GPT-2
        "n_positions",
        # MPT
        "max_seq_len",
        # ChatGLM2
        "seq_length",
        # Others
        "max_sequence_length",
        "max_seq_length",
        "seq_len",
    ]
    for key in possible_keys:
        max_len_key = getattr(hf_config, key, None)
        if max_len_key is not None:
            derived_max_model_len = min(derived_max_model_len, max_len_key)
    if derived_max_model_len == float("inf"):
        if max_model_len is not None:
            # If max_model_len is specified, we use it.
            return max_model_len

        default_max_len = 2048
        logger.warning("The model's config.json does not contain any of the following "
                       "keys to determine the original maximum length of the model: "
                       f"{possible_keys}. Assuming the model's maximum length is "
                       f"{default_max_len}.")
        derived_max_model_len = default_max_len

    rope_scaling = getattr(hf_config, "rope_scaling", None)
    if rope_scaling is not None:
        assert "factor" in rope_scaling
        scaling_factor = rope_scaling["factor"]
        if rope_scaling["type"] == "yarn":
            derived_max_model_len = rope_scaling["original_max_position_embeddings"]
        derived_max_model_len *= scaling_factor

    if max_model_len is None:
        max_model_len = derived_max_model_len
    elif max_model_len > derived_max_model_len:
        raise ValueError(f"User-specified max_model_len ({max_model_len}) is greater than "
                         f"the derived max_model_len ({max_len_key}={derived_max_model_len}"
                         " in model's config.json). This may lead to incorrect model "
                         "outputs or CUDA errors. Make sure the value is correct and "
                         "within the model context size.")
    return int(max_model_len)
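
For example, the max-length derivation above composes the config-key scan with
rope scaling. A sketch under assumed (illustrative) values:

    # hf_config.max_position_embeddings = 4096 and rope_scaling =
    #     {"type": "yarn", "factor": 4.0, "original_max_position_embeddings": 4096}
    # The key scan sets derived_max_model_len = 4096; the YaRN branch resets it
    # to original_max_position_embeddings (4096), and the scaling factor then
    # multiplies it to 16384, so an unspecified max_model_len defaults to 16384.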
275 verl/third_party/vllm/vllm_v_0_3_1/llm.py (vendored, Normal file)
@@ -0,0 +1,275 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/entrypoints/llm.py

from typing import Dict, List, Optional, Tuple, Union

from tqdm import tqdm
from transformers import PreTrainedTokenizer, PreTrainedTokenizerFast
from transformers import PretrainedConfig
import torch.nn as nn
from .arg_utils import EngineArgs
from .llm_engine_sp import LLMEngine
from vllm.lora.request import LoRARequest
from vllm.outputs import RequestOutput
from vllm.sampling_params import SamplingParams
from vllm.utils import Counter
import torch
from torch.nn.utils.rnn import pad_sequence
from verl.workers.rollout.tokenizer import HybridEngineBaseTokenizer


class LLM:
    """An LLM for generating texts from given prompts and sampling parameters.

    This class includes a tokenizer, a language model (possibly distributed
    across multiple GPUs), and GPU memory space allocated for intermediate
    states (aka KV cache). Given a batch of prompts and sampling parameters,
    this class generates texts from the model, using an intelligent batching
    mechanism and efficient memory management.

    NOTE: This class is intended to be used for offline inference. For online
    serving, use the `AsyncLLMEngine` class instead.
    NOTE: For the comprehensive list of arguments, see `EngineArgs`.

    Args:
        model: A HuggingFace Transformers model instance.
        tokenizer: A HuggingFace Transformers tokenizer instance.
        tokenizer_mode: The tokenizer mode. "auto" will use the fast tokenizer
            if available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Trust remote code (e.g., from HuggingFace) when
            downloading the model and tokenizer.
        tensor_parallel_size: The number of GPUs to use for distributed
            execution with tensor parallelism.
        dtype: The data type for the model weights and activations. Currently,
            we support `float32`, `float16`, and `bfloat16`. If `auto`, we use
            the `torch_dtype` attribute specified in the model config file.
            However, if the `torch_dtype` in the config is `float32`, we will
            use `float16` instead.
        quantization: The method used to quantize the model weights. Currently,
            we support "awq". If None, we assume the model weights are not
            quantized and use `dtype` to determine the data type of the weights.
        revision: The specific model version to use. It can be a branch name,
            a tag name, or a commit id.
        tokenizer_revision: The specific tokenizer version to use. It can be a
            branch name, a tag name, or a commit id.
        seed: The seed to initialize the random number generator for sampling.
        gpu_memory_utilization: The ratio (between 0 and 1) of GPU memory to
            reserve for the model weights, activations, and KV cache. Higher
            values will increase the KV cache size and thus improve the model's
            throughput. However, if the value is too high, it may cause out-of-
            memory (OOM) errors.
        swap_space: The size (GiB) of CPU memory per GPU to use as swap space.
            This can be used for temporarily storing the states of the requests
            when their `best_of` sampling parameters are larger than 1. If all
            requests will have `best_of=1`, you can safely set this to 0.
            Otherwise, too small values may cause out-of-memory (OOM) errors.
        enforce_eager: Whether to enforce eager execution. If True, we will
            disable CUDA graph and always execute the model in eager mode.
            If False, we will use CUDA graph and eager execution in hybrid.
        max_context_len_to_capture: Maximum context len covered by CUDA graphs.
            When a sequence has context length larger than this, we fall back
            to eager mode.
        disable_custom_all_reduce: See ParallelConfig
    """

    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer],
        model_hf_config: PretrainedConfig,
        tokenizer_mode: str = "auto",
        trust_remote_code: bool = False,
        tensor_parallel_size: int = 1,
        dtype: str = "auto",
        quantization: Optional[str] = None,
        revision: Optional[str] = None,
        tokenizer_revision: Optional[str] = None,
        seed: int = 0,
        gpu_memory_utilization: float = 0.9,
        swap_space: int = 4,
        enforce_eager: bool = False,
        max_context_len_to_capture: int = 8192,
        disable_custom_all_reduce: bool = False,
        **kwargs,
    ) -> None:
        if "disable_log_stats" not in kwargs:
            kwargs["disable_log_stats"] = True
        engine_args = EngineArgs(
            model_hf_config=model_hf_config,
            tensor_parallel_size=tensor_parallel_size,
            dtype=dtype,
            quantization=quantization,
            revision=revision,
            tokenizer_revision=tokenizer_revision,
            seed=seed,
            gpu_memory_utilization=gpu_memory_utilization,
            swap_space=swap_space,
            enforce_eager=enforce_eager,
            max_context_len_to_capture=max_context_len_to_capture,
            disable_custom_all_reduce=disable_custom_all_reduce,
            **kwargs,
        )
        tokenizer_cls = (PreTrainedTokenizer, PreTrainedTokenizerFast, HybridEngineBaseTokenizer)
        if not isinstance(tokenizer, tokenizer_cls):
            raise ValueError(
                f"Unexpected tokenizer type: {type(tokenizer)}. Must be "
                "one of the following: PreTrainedTokenizer, PreTrainedTokenizerFast, verl.workers.rollout.HybridEngineBaseTokenizer"
            )
        self.llm_engine = LLMEngine.from_engine_args(model, tokenizer, engine_args)
        self.request_counter = Counter()

    def init_cache_engine(self):
        self.llm_engine.init_cache_engine()

    def free_cache_engine(self):
        self.llm_engine.free_cache_engine()

    def get_tokenizer(self) -> Union[PreTrainedTokenizer, PreTrainedTokenizerFast]:
        return self.llm_engine.tokenizer

    def set_tokenizer(
        self,
        tokenizer: Union[PreTrainedTokenizer, PreTrainedTokenizerFast],
    ) -> None:
        self.llm_engine.tokenizer = tokenizer

    def generate(
        self,
        prompts: Optional[Union[str, List[str]]] = None,
        sampling_params: Optional[SamplingParams] = None,
        prompt_token_ids: Optional[List[List[int]]] = None,
        prefix_pos: Optional[Union[int, List[int]]] = None,
        use_tqdm: bool = True,
        lora_request: Optional[LoRARequest] = None,
    ) -> List[RequestOutput]:
        """Generates the completions for the input prompts.

        NOTE: This class automatically batches the given prompts, considering
        the memory constraint. For the best performance, put all of your prompts
        into a single list and pass it to this method.

        Args:
            prompts: A list of prompts to generate completions for.
            sampling_params: The sampling parameters for text generation. If
                None, we use the default sampling parameters.
            prompt_token_ids: A list of token IDs for the prompts. If None, we
                use the tokenizer to convert the prompts to token IDs.
            use_tqdm: Whether to use tqdm to display the progress bar.

        Returns:
            A list of `RequestOutput` objects containing the generated
            completions in the same order as the input prompts.
        """
        if prompts is None and prompt_token_ids is None:
            raise ValueError("Either prompts or prompt_token_ids must be "
                             "provided.")
        if isinstance(prompts, str):
            # Convert a single prompt to a list.
            prompts = [prompts]
        if prompts is not None and prompt_token_ids is not None:
            if len(prompts) != len(prompt_token_ids):
                raise ValueError("The lengths of prompts and prompt_token_ids "
                                 "must be the same.")
        if sampling_params is None:
            # Use default sampling params.
            sampling_params = SamplingParams()

        # Add requests to the engine.
        num_requests = len(prompts) if prompts is not None else len(prompt_token_ids)
        for i in range(num_requests):
            prompt = prompts[i] if prompts is not None else None
            prefix_pos_i = prefix_pos[i] if prefix_pos is not None else None
            token_ids = None if prompt_token_ids is None else prompt_token_ids[i]
            if not isinstance(token_ids, list):
                # NOTE(shengguangming): convert the rollout input into List[int]
                token_ids = self._pre_process_inputs(token_ids)
            self._add_request(prompt, sampling_params, token_ids, lora_request=lora_request, prefix_pos=prefix_pos_i)
        return self._run_engine(use_tqdm)

    def _add_request(
        self,
        prompt: Optional[str],
        sampling_params: SamplingParams,
        prompt_token_ids: Optional[List[int]],
        lora_request: Optional[LoRARequest] = None,
        prefix_pos: Optional[int] = None,
    ) -> None:
        request_id = str(next(self.request_counter))
        self.llm_engine.add_request(request_id,
                                    prompt,
                                    sampling_params,
                                    prompt_token_ids,
                                    lora_request=lora_request,
                                    prefix_pos=prefix_pos)

    def _run_engine(self, use_tqdm: bool) -> List[RequestOutput]:
        # Initialize tqdm.
        if use_tqdm:
            num_requests = self.llm_engine.get_num_unfinished_requests()
            pbar = tqdm(total=num_requests, desc="Processed prompts")
        # Run the engine.
        outputs: List[RequestOutput] = []
        while self.llm_engine.has_unfinished_requests():
            step_outputs = self.llm_engine.step()
            for output in step_outputs:
                if output.finished:
                    outputs.append(output)
                    if use_tqdm:
                        pbar.update(1)
        if use_tqdm:
            pbar.close()
        # Sort the outputs by request ID.
        # This is necessary because some requests may be finished earlier than
        # its previous requests.
        outputs = sorted(outputs, key=lambda x: int(x.request_id))
        # TODO(shengguangming): maybe we can hack the autoregressive logics without only apply post process for better performance
        return self._post_process_outputs(outputs)

    # NOTE(shengguangming): add for verl
    # TODO(sgm): we can optimize it by making the dataloader yield List[int] without padding.
    def _pre_process_inputs(self, prompt_token_ids: torch.Tensor) -> List[int]:
        # remove the left padding in the prompt token_id
        pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
        non_pad_index = torch.nonzero(prompt_token_ids != pad_token_id, as_tuple=False)[0][0]
        token_ids = prompt_token_ids[non_pad_index:].tolist()
        return token_ids

    # NOTE(shengguangming): add for verl
    def _post_process_outputs(self, outputs: List[RequestOutput]) -> Tuple[torch.Tensor, torch.Tensor]:
        output_token_ids = []
        logprobs = []
        for output in outputs:  # List[RequestOutput]
            output = output.outputs
            for output in output:  # List[CompletionOutput], usually len == 1
                output_token_ids.append(torch.tensor(output.token_ids))
                # TODO(shengguangming): can be optimized by rewriting the Sampler._get_logprobs() logits
                logprobs_dicts = output.logprobs
                if logprobs_dicts is not None:
                    logprob = []
                    for logprobs_dict, id in zip(logprobs_dicts, output.token_ids):
                        logprob.append(logprobs_dict[id])
                    logprobs.append(torch.tensor(logprob))

        pad_token_id = self.llm_engine.tokenizer.pad_token_id if self.llm_engine.tokenizer.pad_token_id is not None else self.llm_engine.tokenizer.eos_token_id
        output_token_ids = pad_sequence(output_token_ids, batch_first=True, padding_value=pad_token_id)
        if len(logprobs) > 0:
            logprobs = pad_sequence(logprobs, batch_first=True, padding_value=pad_token_id)
        return output_token_ids, logprobs

    def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor]) -> None:
        self.llm_engine.sync_model_weights(actor_weights=actor_weights)

    def offload_model_weights(self) -> None:
        self.llm_engine.offload_model_weights()
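
A minimal end-to-end sketch of this wrapper (illustrative only: it assumes a
single-GPU run with the RANK environment variable set, as verl's rollout
workers normally arrange, and uses a hypothetical small checkpoint):

    import torch
    from transformers import AutoConfig, AutoTokenizer, AutoModelForCausalLM

    name = "facebook/opt-125m"
    actor = AutoModelForCausalLM.from_pretrained(name)
    llm = LLM(model=actor,
              tokenizer=AutoTokenizer.from_pretrained(name),
              model_hf_config=AutoConfig.from_pretrained(name))
    llm.init_cache_engine()
    # generate() accepts left-padded prompt rows; each row is un-padded by
    # _pre_process_inputs before being handed to the engine.
    prompt_ids = torch.tensor([[1, 2, 3, 4]])
    out_ids, logprobs = llm.generate(prompt_token_ids=prompt_ids, use_tqdm=False)
    llm.free_cache_engine()                     # release KV cache between rollouts
    llm.sync_model_weights(actor.state_dict())  # refresh weights after a train step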
765 verl/third_party/vllm/vllm_v_0_3_1/llm_engine_sp.py (vendored, Normal file)
@@ -0,0 +1,765 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/engine/llm_engine.py
|
||||
|
||||
import os
|
||||
import socket
|
||||
import time
|
||||
import torch
|
||||
from typing import TYPE_CHECKING, Any, Dict, Iterable, List, Optional, Tuple, Union
|
||||
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
|
||||
from vllm.core.scheduler import Scheduler, SchedulerOutputs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.sequence import (SamplerOutput, Sequence, SequenceGroup, SequenceGroupMetadata, SequenceGroupOutput,
|
||||
SequenceOutput, SequenceStatus)
|
||||
from vllm.transformers_utils.tokenizer import detokenize_incrementally
|
||||
from vllm.engine.metrics import StatLogger, Stats
|
||||
from vllm.utils import Counter
|
||||
import torch.nn as nn
|
||||
from .arg_utils import EngineArgs
|
||||
from .tokenizer import TokenizerGroup
|
||||
|
||||
logger = init_logger(__name__)
|
||||
_LOCAL_LOGGING_INTERVAL_SEC = 5
|
||||
|
||||
|
||||
class LLMEngine:
|
||||
"""An LLM engine that receives requests and generates texts.
|
||||
|
||||
This is the main class for the vLLM engine. It receives requests
|
||||
from clients and generates texts from the LLM. It includes a tokenizer, a
|
||||
language model (possibly distributed across multiple GPUs), and GPU memory
|
||||
space allocated for intermediate states (aka KV cache). This class utilizes
|
||||
iteration-level scheduling and efficient memory management to maximize the
|
||||
serving throughput.
|
||||
|
||||
The `LLM` class wraps this class for offline batched inference and the
|
||||
`AsyncLLMEngine` class wraps this class for online serving.
|
||||
|
||||
NOTE: The config arguments are derived from the `EngineArgs` class. For the
|
||||
comprehensive list of arguments, see `EngineArgs`.
|
||||
|
||||
Args:
|
||||
model_config: The configuration related to the LLM model.
|
||||
cache_config: The configuration related to the KV cache memory
|
||||
management.
|
||||
parallel_config: The configuration related to distributed execution.
|
||||
scheduler_config: The configuration related to the request scheduler.
|
||||
distributed_init_method: The initialization method for distributed
|
||||
execution. See `torch.distributed.init_process_group` for details.
|
||||
placement_group: Ray placement group for distributed execution.
|
||||
Required for distributed execution.
|
||||
log_stats: Whether to log statistics.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: Union[nn.Module, Dict], # model itself or its parameter dict
|
||||
tokenizer: nn.Module,
|
||||
model_config: ModelConfig,
|
||||
cache_config: CacheConfig,
|
||||
parallel_config: ParallelConfig,
|
||||
scheduler_config: SchedulerConfig,
|
||||
device_config: DeviceConfig,
|
||||
lora_config: Optional[LoRAConfig],
|
||||
distributed_init_method: str,
|
||||
placement_group: Optional[None],
|
||||
log_stats: bool,
|
||||
) -> None:
|
||||
logger.info("Initializing an LLM engine with config: "
|
||||
f"model={model_config.model!r}, "
|
||||
f"tokenizer={model_config.tokenizer!r}, "
|
||||
# f"tokenizer_mode={model_config.tokenizer_mode}, "
|
||||
f"revision={model_config.revision}, "
|
||||
f"tokenizer_revision={model_config.tokenizer_revision}, "
|
||||
# f"trust_remote_code={model_config.trust_remote_code}, "
|
||||
f"dtype={model_config.dtype}, "
|
||||
f"max_seq_len={model_config.max_model_len}, "
|
||||
# f"download_dir={model_config.download_dir!r}, "
|
||||
# f"load_format={model_config.load_format}, "
|
||||
f"disable_custom_all_reduce={parallel_config.disable_custom_all_reduce}, "
|
||||
f"tensor_parallel_size={parallel_config.tensor_parallel_size}, "
|
||||
f"quantization={model_config.quantization}, "
|
||||
f"seed={model_config.seed})")
|
||||
# TODO(woosuk): Print more configs in debug mode.
|
||||
|
||||
self.model_config = model_config # TODO: currently is hfconfig
|
||||
self.cache_config = cache_config
|
||||
self.lora_config = lora_config
|
||||
assert self.cache_config.sliding_window == getattr(self.model_config.hf_config, "sliding_window", None)
|
||||
self.parallel_config = parallel_config
|
||||
self.scheduler_config = scheduler_config
|
||||
self.device_config = device_config
|
||||
self.log_stats = log_stats
|
||||
self._verify_args()
|
||||
|
||||
# self.model = model # should not store the model, it should be deleted
|
||||
# TODO(shengguangming): maybe we can choose init here or from arguments
|
||||
self._init_tokenizer(tokenizer)
|
||||
|
||||
self.seq_counter = Counter()
|
||||
|
||||
# Create the parallel GPU workers.
|
||||
self._init_workers_sp(model, distributed_init_method)
|
||||
|
||||
# Profile the memory usage and initialize the cache.
|
||||
self._init_cache_sp()
|
||||
|
||||
# Create the scheduler.
|
||||
# NOTE(shengguangming): each process will have independent scheduler
|
||||
self.scheduler = Scheduler(scheduler_config, cache_config, lora_config)
|
||||
|
||||
# Metric Logging.
|
||||
if self.log_stats:
|
||||
self.stat_logger = StatLogger(local_interval=_LOCAL_LOGGING_INTERVAL_SEC)
|
||||
|
||||
# Logging.
|
||||
self.last_logging_time = 0.0
|
||||
# List of (timestamp, num_tokens)
|
||||
self.num_prompt_tokens: List[Tuple[float, int]] = []
|
||||
# List of (timestamp, num_tokens)
|
||||
self.num_generation_tokens: List[Tuple[float, int]] = []
|
||||
|
||||
def _init_tokenizer(self, tokenizer, **tokenizer_init_kwargs):
|
||||
init_kwargs = dict(enable_lora=bool(self.lora_config),
|
||||
max_num_seqs=self.scheduler_config.max_num_seqs,
|
||||
max_input_length=None)
|
||||
init_kwargs.update(tokenizer_init_kwargs)
|
||||
self.tokenizer: TokenizerGroup = TokenizerGroup(tokenizer, **init_kwargs)
|
||||
|
||||
# TODO: check get_lora_tokenizer func
|
||||
def get_tokenizer_for_seq(self, sequence: Sequence):
|
||||
return self.tokenizer.get_lora_tokenizer(sequence.lora_request)
|
||||
|
||||
def _init_workers_sp(self, model, distributed_init_method: str):
|
||||
# Lazy import the Worker to avoid importing torch.cuda/xformers
|
||||
# before CUDA_VISIBLE_DEVICES is set in the Worker
|
||||
from .worker import Worker # pylint: disable=import-outside-toplevel
|
||||
|
||||
rank = int(os.getenv("RANK"))
|
||||
|
||||
self.worker = Worker(
|
||||
model,
|
||||
self.model_config,
|
||||
self.parallel_config,
|
||||
self.scheduler_config,
|
||||
self.device_config,
|
||||
rank,
|
||||
distributed_init_method,
|
||||
lora_config=self.lora_config,
|
||||
kv_cache_dtype=self.cache_config.cache_dtype,
|
||||
)
|
||||
|
||||
# NOTE(shengguangming): torch.distributed.init_process_group will be called inside the init_model()
|
||||
self.worker.init_model()
|
||||
self.worker.load_model()
|
||||
|
||||
def _verify_args(self) -> None:
|
||||
self.model_config.verify_with_parallel_config(self.parallel_config)
|
||||
self.cache_config.verify_with_parallel_config(self.parallel_config)
|
||||
|
||||
def _init_cache_sp(self) -> None:
|
||||
"""Profiles the memory usage and initializes the KV cache."""
|
||||
# Get the maximum number of blocks that can be allocated on GPU and CPU.
|
||||
num_blocks = self.worker.profile_num_available_blocks(
|
||||
block_size=self.cache_config.block_size,
|
||||
gpu_memory_utilization=self.cache_config.gpu_memory_utilization,
|
||||
cpu_swap_space=self.cache_config.swap_space_bytes,
|
||||
cache_dtype=self.cache_config.cache_dtype,
|
||||
)
|
||||
|
||||
# NOTE(shengguangming): we no longer use a shared centralized controller; each process will
|
||||
# have its own scheduler
|
||||
num_gpu_blocks = num_blocks[0]
|
||||
num_cpu_blocks = num_blocks[1]
|
||||
|
||||
# FIXME(woosuk): Change to debug log.
|
||||
logger.info(f"# GPU blocks: {num_gpu_blocks}, "
|
||||
f"# CPU blocks: {num_cpu_blocks}")
|
||||
|
||||
if num_gpu_blocks <= 0:
|
||||
raise ValueError("No available memory for the cache blocks. "
|
||||
"Try increasing `gpu_memory_utilization` when "
|
||||
"initializing the engine.")
|
||||
|
||||
max_seq_len = self.cache_config.block_size * num_gpu_blocks
|
||||
if self.model_config.max_model_len > max_seq_len:
|
||||
raise ValueError(f"The model's max seq len ({self.model_config.max_model_len}) "
|
||||
"is larger than the maximum number of tokens that can be "
|
||||
f"stored in KV cache ({max_seq_len}). Try increasing "
|
||||
"`gpu_memory_utilization` or decreasing `max_model_len` when "
|
||||
"initializing the engine.")
|
||||
|
||||
self.cache_config.num_gpu_blocks = num_gpu_blocks
|
||||
self.cache_config.num_cpu_blocks = num_cpu_blocks
|
||||
|
||||
# Initialize the cache.
|
||||
self.worker.init_cache_engine(cache_config=self.cache_config)
|
||||
self.worker.warm_up_model()
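# Worked example (illustrative numbers): with the default block_size=16, a profile
# that reports num_gpu_blocks=2048 gives a KV-cache capacity of 16 * 2048 = 32768
# tokens, so any max_model_len larger than 32768 would trigger the ValueError above.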
|
||||
|
||||
def init_cache_engine(self):
|
||||
self.worker.init_cache_engine(cache_config=self.cache_config)
|
||||
|
||||
def free_cache_engine(self):
|
||||
self.worker.free_cache_engine()
|
||||
|
||||
@classmethod
|
||||
def from_engine_args(cls, model, tokenizer, engine_args: EngineArgs) -> "LLMEngine":
|
||||
"""Creates an LLM engine from the engine arguments."""
|
||||
# Create the engine configs.
|
||||
engine_configs = engine_args.create_engine_configs()
|
||||
parallel_config = engine_configs[2]
|
||||
# Initialize the cluster.
|
||||
distributed_init_method, placement_group = initialize_cluster(parallel_config)
|
||||
# Create the LLM engine.
|
||||
engine = cls(model,
|
||||
tokenizer,
|
||||
*engine_configs,
|
||||
distributed_init_method,
|
||||
placement_group,
|
||||
log_stats=not engine_args.disable_log_stats)
|
||||
return engine
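# Illustrative usage sketch (assumed caller-side names, not part of this file):
#
#   engine_args = EngineArgs(model_hf_config=hf_config, tensor_parallel_size=1)
#   engine = LLMEngine.from_engine_args(actor_model, tokenizer, engine_args)
#   engine.add_request("0", "Who is the president?", SamplingParams(temperature=0.0))
#   while engine.has_unfinished_requests():
#       request_outputs = engine.step()
#
# Here hf_config, actor_model and tokenizer are assumed to come from the trainer
# (e.g. a Hugging Face config, the actor nn.Module or its state_dict, and its tokenizer).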
|
||||
|
||||
def add_request(
|
||||
self,
|
||||
request_id: str,
|
||||
prompt: Optional[str],
|
||||
sampling_params: SamplingParams,
|
||||
prompt_token_ids: Optional[List[int]] = None,
|
||||
arrival_time: Optional[float] = None,
|
||||
lora_request: Optional[LoRARequest] = None,
|
||||
prefix_pos: Optional[int] = None,
|
||||
) -> None:
|
||||
"""Add a request to the engine's request pool.
|
||||
|
||||
The request is added to the request pool and will be processed by the
|
||||
scheduler as `engine.step()` is called. The exact scheduling policy is
|
||||
determined by the scheduler.
|
||||
|
||||
Args:
|
||||
request_id: The unique ID of the request.
|
||||
prompt: The prompt string. Can be None if prompt_token_ids is
|
||||
provided.
|
||||
sampling_params: The sampling parameters for text generation.
|
||||
prompt_token_ids: The token IDs of the prompt. If None, we
|
||||
use the tokenizer to convert the prompts to token IDs.
|
||||
arrival_time: The arrival time of the request. If None, we use
|
||||
the current monotonic time.
|
||||
prefix_pos: If not None, we use the given position as the prefix
|
||||
position for each prompt. We will cache the prefix's KV
|
||||
cache and reuse it for the next request with the same prefix.
|
||||
This is an experimental feature, and may be replaced with
|
||||
automatic prefix caching in the future.
|
||||
|
||||
Details:
|
||||
- Set arrival_time to the current time if it is None.
|
||||
- Set prompt_token_ids to the encoded prompt if it is None.
|
||||
- Create `best_of` number of :class:`~vllm.Sequence` objects.
|
||||
- Create a :class:`~vllm.SequenceGroup` object
|
||||
from the list of :class:`~vllm.Sequence`.
|
||||
- Add the :class:`~vllm.SequenceGroup` object to the scheduler.
|
||||
|
||||
Example:
|
||||
>>> # initialize engine
|
||||
>>> engine = LLMEngine.from_engine_args(engine_args)
|
||||
>>> # set request arguments
|
||||
>>> example_prompt = "Who is the president of the United States?"
|
||||
>>> sampling_params = SamplingParams(temperature=0.0)
|
||||
>>> request_id = 0
|
||||
>>>
|
||||
>>> # add the request to the engine
|
||||
>>> engine.add_request(
|
||||
>>> str(request_id),
|
||||
>>> example_prompt,
|
||||
>>> SamplingParams(temperature=0.0))
|
||||
>>> # continue the request processing
|
||||
>>> ...
|
||||
"""
|
||||
if lora_request is not None and not self.lora_config:
|
||||
raise ValueError(f"Got lora_request {lora_request} but LoRA is "
|
||||
"not enabled!")
|
||||
if arrival_time is None:
|
||||
arrival_time = time.monotonic()
|
||||
if prompt_token_ids is None:
|
||||
assert prompt is not None
|
||||
prompt_token_ids = self.tokenizer.encode(prompt)
|
||||
|
||||
# Create the sequences.
|
||||
block_size = self.cache_config.block_size
|
||||
seq_id = next(self.seq_counter)
|
||||
seq = Sequence(seq_id, prompt, prompt_token_ids, block_size, lora_request)
|
||||
|
||||
# Check whether the input specifies prefix
|
||||
prefix = self.scheduler.prefix_pool.add_or_get_prefix(prompt_token_ids[:prefix_pos], lora_request.lora_int_id if
|
||||
lora_request else 0) if prefix_pos is not None else None
|
||||
|
||||
# Create the sequence group.
|
||||
seq_group = SequenceGroup(request_id, [seq], sampling_params, arrival_time, lora_request, prefix)
|
||||
|
||||
# Add the sequence group to the scheduler.
|
||||
self.scheduler.add_seq_group(seq_group)
|
||||
|
||||
def abort_request(self, request_id: Union[str, Iterable[str]]) -> None:
|
||||
"""Aborts a request(s) with the given ID.
|
||||
|
||||
Args:
|
||||
request_id: The ID(s) of the request to abort.
|
||||
|
||||
Details:
|
||||
- Refer to the
|
||||
:meth:`~vllm.core.scheduler.Scheduler.abort_seq_group`
|
||||
from class :class:`~vllm.core.scheduler.Scheduler`.
|
||||
|
||||
Example:
|
||||
>>> # initialize engine and add a request with request_id
|
||||
>>> request_id = str(0)
|
||||
>>> # abort the request
|
||||
>>> engine.abort_request(request_id)
|
||||
"""
|
||||
self.scheduler.abort_seq_group(request_id)
|
||||
|
||||
def get_model_config(self) -> ModelConfig:
|
||||
"""Gets the model configuration."""
|
||||
return self.model_config
|
||||
|
||||
def get_num_unfinished_requests(self) -> int:
|
||||
"""Gets the number of unfinished requests."""
|
||||
return self.scheduler.get_num_unfinished_seq_groups()
|
||||
|
||||
def has_unfinished_requests(self) -> bool:
|
||||
"""Returns True if there are unfinished requests."""
|
||||
return self.scheduler.has_unfinished_seqs()
|
||||
|
||||
def _check_beam_search_early_stopping(
|
||||
self,
|
||||
early_stopping: Union[bool, str],
|
||||
sampling_params: SamplingParams,
|
||||
best_running_seq: Sequence,
|
||||
current_worst_seq: Sequence,
|
||||
) -> bool:
|
||||
assert sampling_params.use_beam_search
|
||||
length_penalty = sampling_params.length_penalty
|
||||
if early_stopping is True:
|
||||
return True
|
||||
|
||||
current_worst_score = (current_worst_seq.get_beam_search_score(
|
||||
length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(current_worst_seq).eos_token_id))
|
||||
if early_stopping is False:
|
||||
highest_attainable_score = (best_running_seq.get_beam_search_score(
|
||||
length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id))
|
||||
else:
|
||||
assert early_stopping == "never"
|
||||
if length_penalty > 0.0:
|
||||
# If length_penalty > 0.0, beam search will prefer longer
|
||||
# sequences. The highest attainable score calculation is
|
||||
# based on the longest possible sequence length in this case.
|
||||
max_possible_length = max(best_running_seq.get_prompt_len() + sampling_params.max_tokens,
|
||||
self.scheduler_config.max_model_len)
|
||||
highest_attainable_score = (best_running_seq.get_beam_search_score(
|
||||
length_penalty=length_penalty,
|
||||
eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id,
|
||||
seq_len=max_possible_length))
|
||||
else:
|
||||
# Otherwise, beam search will prefer shorter sequences. The
|
||||
# highest attainable score calculation is based on the current
|
||||
# sequence length.
|
||||
highest_attainable_score = (best_running_seq.get_beam_search_score(
|
||||
length_penalty=length_penalty,
|
||||
eos_token_id=self.get_tokenizer_for_seq(best_running_seq).eos_token_id))
return current_worst_score >= highest_attainable_score
|
||||
|
||||
def _process_sequence_group_outputs(self, seq_group: SequenceGroup, outputs: SequenceGroupOutput) -> None:
|
||||
|
||||
# Process prompt logprobs
|
||||
prompt_logprobs = outputs.prompt_logprobs
|
||||
if prompt_logprobs is not None:
|
||||
seq_group.prompt_logprobs = prompt_logprobs
|
||||
|
||||
# Process samples
|
||||
samples = outputs.samples
|
||||
parent_seqs = seq_group.get_seqs(status=SequenceStatus.RUNNING)
|
||||
existing_finished_seqs = seq_group.get_finished_seqs()
|
||||
parent_child_dict = {parent_seq.seq_id: [] for parent_seq in parent_seqs}
|
||||
for sample in samples:
|
||||
parent_child_dict[sample.parent_seq_id].append(sample)
|
||||
# List of (child, parent)
|
||||
child_seqs: List[Tuple[Sequence, Sequence]] = []
|
||||
|
||||
# Process the child samples for each parent sequence
|
||||
for parent in parent_seqs:
|
||||
child_samples: List[SequenceOutput] = parent_child_dict[parent.seq_id]
|
||||
if len(child_samples) == 0:
|
||||
# This parent sequence has no children samples. Remove
|
||||
# the parent sequence from the sequence group since it will
|
||||
# not be used in the future iterations.
|
||||
parent.status = SequenceStatus.FINISHED_ABORTED
|
||||
seq_group.remove(parent.seq_id)
|
||||
self.scheduler.free_seq(parent)
|
||||
continue
|
||||
# Fork the parent sequence if there are multiple child samples.
|
||||
for child_sample in child_samples[:-1]:
|
||||
new_child_seq_id = next(self.seq_counter)
|
||||
child = parent.fork(new_child_seq_id)
|
||||
child.append_token_id(child_sample.output_token, child_sample.logprobs)
|
||||
child_seqs.append((child, parent))
|
||||
# Continue the parent sequence for the last child sample.
|
||||
# We reuse the parent sequence here to reduce redundant memory
|
||||
# copies, especially when using non-beam search sampling methods.
|
||||
last_child_sample = child_samples[-1]
|
||||
parent.append_token_id(last_child_sample.output_token, last_child_sample.logprobs)
|
||||
child_seqs.append((parent, parent))
|
||||
|
||||
for seq, _ in child_seqs:
|
||||
# self._decode_sequence(seq, seq_group.sampling_params)
|
||||
self._check_stop(seq, seq_group.sampling_params)
|
||||
|
||||
# Non-beam search case
|
||||
if not seq_group.sampling_params.use_beam_search:
|
||||
# For newly created child sequences, add them to the sequence group
|
||||
# and fork them in block manager if they are not finished.
|
||||
for seq, parent in child_seqs:
|
||||
if seq is not parent:
|
||||
seq_group.add(seq)
|
||||
if not seq.is_finished():
|
||||
self.scheduler.fork_seq(parent, seq)
|
||||
|
||||
# Free the finished and selected parent sequences' memory in block
|
||||
# manager. Keep them in the sequence group as candidate output.
|
||||
# NOTE: we need to fork the new sequences before freeing the
|
||||
# old sequences.
|
||||
for seq, parent in child_seqs:
|
||||
if seq is parent and seq.is_finished():
|
||||
self.scheduler.free_seq(seq)
|
||||
return
|
||||
|
||||
# Beam search case
|
||||
# Select the child sequences to keep in the sequence group.
|
||||
selected_child_seqs = []
|
||||
unselected_child_seqs = []
|
||||
beam_width = seq_group.sampling_params.best_of
|
||||
length_penalty = seq_group.sampling_params.length_penalty
|
||||
|
||||
# Select the newly finished sequences with the highest scores
|
||||
# to replace existing finished sequences.
|
||||
# Tuple of (seq, parent, is_new)
|
||||
existing_finished_seqs = [(seq, None, False) for seq in existing_finished_seqs]
|
||||
new_finished_seqs = [(seq, parent, True) for seq, parent in child_seqs if seq.is_finished()]
|
||||
all_finished_seqs = existing_finished_seqs + new_finished_seqs
|
||||
# Sort the finished sequences by their scores.
|
||||
all_finished_seqs.sort(key=lambda x: x[0].get_beam_search_score(
|
||||
length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id),
|
||||
reverse=True)
|
||||
for seq, parent, is_new in all_finished_seqs[:beam_width]:
|
||||
if is_new:
|
||||
# A newly generated child sequence finishes and has a high
|
||||
# score, so we will add it into the sequence group.
|
||||
selected_child_seqs.append((seq, parent))
|
||||
for seq, parent, is_new in all_finished_seqs[beam_width:]:
|
||||
if is_new:
|
||||
# A newly generated child sequence finishes but has a low
|
||||
# score, so we will not add it into the sequence group.
|
||||
# Additionally, if this sequence is a continuation of a
|
||||
# parent sequence, we will need to remove the parent sequence
|
||||
# from the sequence group.
|
||||
unselected_child_seqs.append((seq, parent))
|
||||
else:
|
||||
# An existing finished sequence has a low score, so we will
|
||||
# remove it from the sequence group.
|
||||
seq_group.remove(seq.seq_id)
|
||||
|
||||
# select the top beam_width sequences from the running
|
||||
# sequences for the next iteration to continue the beam
|
||||
# search.
|
||||
running_child_seqs = [(seq, parent) for seq, parent in child_seqs if not seq.is_finished()]
|
||||
# Sort the running sequences by their scores.
|
||||
running_child_seqs.sort(key=lambda x: x[0].get_beam_search_score(
|
||||
length_penalty=length_penalty, eos_token_id=self.get_tokenizer_for_seq(x[0]).eos_token_id),
|
||||
reverse=True)
|
||||
|
||||
# Check if we can stop the beam search.
|
||||
if len(running_child_seqs) == 0:
|
||||
# No running sequences, stop the beam search.
|
||||
stop_beam_search = True
|
||||
elif len(all_finished_seqs) < beam_width:
|
||||
# Not enough finished sequences, continue the beam search.
|
||||
stop_beam_search = False
|
||||
else:
|
||||
# Check the early stopping criteria
|
||||
best_running_seq = running_child_seqs[0][0]
|
||||
current_worst_seq = all_finished_seqs[beam_width - 1][0]
|
||||
stop_beam_search = self._check_beam_search_early_stopping(seq_group.sampling_params.early_stopping,
|
||||
seq_group.sampling_params, best_running_seq,
|
||||
current_worst_seq)
|
||||
|
||||
if stop_beam_search:
|
||||
# Stop the beam search and remove all the running sequences from
|
||||
# the sequence group.
|
||||
unselected_child_seqs.extend(running_child_seqs)
|
||||
else:
|
||||
# Continue the beam search and select the top beam_width sequences
|
||||
# to continue the beam search.
|
||||
selected_child_seqs.extend(running_child_seqs[:beam_width])
|
||||
# The remaining running sequences will not be used in the next
|
||||
# iteration. Again, if these sequences are continuations of
|
||||
# parent sequences, we will need to remove the parent sequences
|
||||
# from the sequence group.
|
||||
unselected_child_seqs.extend(running_child_seqs[beam_width:])
|
||||
|
||||
# For newly created child sequences, add them to the sequence group
|
||||
# and fork them in block manager if they are not finished.
|
||||
for seq, parent in selected_child_seqs:
|
||||
if seq is not parent:
|
||||
seq_group.add(seq)
|
||||
if not seq.is_finished():
|
||||
self.scheduler.fork_seq(parent, seq)
|
||||
|
||||
# Free the finished and selected parent sequences' memory in block
|
||||
# manager. Keep them in the sequence group as candidate output.
|
||||
for seq, parent in selected_child_seqs:
|
||||
if seq is parent and seq.is_finished():
|
||||
self.scheduler.free_seq(seq)
|
||||
|
||||
# Remove the unselected parent sequences from the sequence group and
|
||||
# free their memory in block manager.
|
||||
for seq, parent in unselected_child_seqs:
|
||||
if seq is parent:
|
||||
# Remove the parent sequence if it is not selected for next
|
||||
# iteration
|
||||
seq_group.remove(seq.seq_id)
|
||||
self.scheduler.free_seq(seq)
|
||||
|
||||
def _process_model_outputs(self, output: SamplerOutput, scheduler_outputs: SchedulerOutputs) -> List[RequestOutput]:
|
||||
# Update the scheduled sequence groups with the model outputs.
|
||||
scheduled_seq_groups = scheduler_outputs.scheduled_seq_groups
|
||||
for seq_group, outputs in zip(scheduled_seq_groups, output):
|
||||
self._process_sequence_group_outputs(seq_group, outputs)
|
||||
|
||||
# Free the finished sequence groups.
|
||||
self.scheduler.free_finished_seq_groups()
|
||||
|
||||
# Create the outputs.
|
||||
request_outputs: List[RequestOutput] = []
|
||||
for seq_group in scheduled_seq_groups:
|
||||
request_output = RequestOutput.from_seq_group(seq_group)
|
||||
request_outputs.append(request_output)
|
||||
for seq_group in scheduler_outputs.ignored_seq_groups:
|
||||
request_output = RequestOutput.from_seq_group(seq_group)
|
||||
request_outputs.append(request_output)
|
||||
|
||||
# Update prefix state, now all the uncomputed prefixes are computed.
|
||||
for seq_group in scheduled_seq_groups:
|
||||
if (seq_group.prefix is not None and seq_group.prefix.allocated and not seq_group.prefix.computed):
|
||||
seq_group.prefix.computed = True
|
||||
|
||||
# Log stats.
|
||||
if self.log_stats:
|
||||
self.stat_logger.log(self._get_stats(scheduler_outputs))
|
||||
|
||||
return request_outputs
|
||||
|
||||
def step(self) -> List[RequestOutput]:
|
||||
"""Performs one decoding iteration and returns newly generated results.
|
||||
|
||||
This function performs one decoding iteration of the engine. It first
|
||||
schedules the sequences to be executed in the next iteration and the
|
||||
token blocks to be swapped in/out/copy. Then, it executes the model
|
||||
and updates the scheduler with the model outputs. Finally, it decodes
|
||||
the sequences and returns the newly generated results.
|
||||
"""
|
||||
seq_group_metadata_list, scheduler_outputs = self.scheduler.schedule()
|
||||
if not scheduler_outputs.is_empty():
|
||||
output = self.worker.execute_model(
|
||||
seq_group_metadata_list=seq_group_metadata_list, # TODO: check this input
|
||||
blocks_to_swap_in=scheduler_outputs.blocks_to_swap_in,
|
||||
blocks_to_swap_out=scheduler_outputs.blocks_to_swap_out,
|
||||
blocks_to_copy=scheduler_outputs.blocks_to_copy,)
|
||||
else:
|
||||
return [RequestOutput.from_seq_group(seq_group) for seq_group in scheduler_outputs.ignored_seq_groups]
|
||||
|
||||
return self._process_model_outputs(output, scheduler_outputs)
|
||||
|
||||
def do_log_stats(self) -> None:
|
||||
"""Forced log when no requests active."""
|
||||
if self.log_stats:
|
||||
self.stat_logger.log(self._get_stats(scheduler_outputs=None))
|
||||
|
||||
def _get_stats(self, scheduler_outputs: Optional[SchedulerOutputs]) -> Stats:
|
||||
"""Get Stats to be Logged to Prometheus."""
|
||||
now = time.monotonic()
|
||||
|
||||
# KV Cache Usage in %.
|
||||
num_total_gpu = self.cache_config.num_gpu_blocks
|
||||
num_free_gpu = self.scheduler.block_manager.get_num_free_gpu_blocks()
|
||||
gpu_cache_usage = 1.0 - (num_free_gpu / num_total_gpu)
|
||||
|
||||
num_total_cpu = self.cache_config.num_cpu_blocks
|
||||
cpu_cache_usage = 0.
|
||||
if num_total_cpu > 0:
|
||||
num_free_cpu = self.scheduler.block_manager.get_num_free_cpu_blocks()
|
||||
cpu_cache_usage = 1.0 - (num_free_cpu / num_total_cpu)
|
||||
|
||||
# Scheduler State
|
||||
num_running = len(self.scheduler.running)
|
||||
num_swapped = len(self.scheduler.swapped)
|
||||
num_waiting = len(self.scheduler.waiting)
|
||||
|
||||
# Iteration stats if we have scheduler output.
|
||||
num_prompt_tokens = 0
|
||||
num_generation_tokens = 0
|
||||
time_to_first_tokens = []
|
||||
time_per_output_tokens = []
|
||||
time_e2e_requests = []
|
||||
if scheduler_outputs is not None:
|
||||
prompt_run = scheduler_outputs.prompt_run
|
||||
|
||||
# Number of Tokens.
|
||||
if prompt_run:
|
||||
num_prompt_tokens = scheduler_outputs.num_batched_tokens
|
||||
else:
|
||||
num_generation_tokens = scheduler_outputs.num_batched_tokens
|
||||
|
||||
# Latency Timings.
|
||||
time_last_iters = []
|
||||
for seq_group in scheduler_outputs.scheduled_seq_groups:
|
||||
# Time since last token. (n.b. updates seq_group.last_token_time)
|
||||
time_last_iters.append(seq_group.get_last_latency(now))
|
||||
# Time since arrival for all finished requests.
|
||||
if seq_group.is_finished():
|
||||
time_e2e_requests.append(now - seq_group.arrival_time)
|
||||
|
||||
time_to_first_tokens = time_last_iters if prompt_run else []
|
||||
time_per_output_tokens = [] if prompt_run else time_last_iters
|
||||
|
||||
return Stats(
|
||||
now=now,
|
||||
num_running=num_running,
|
||||
num_swapped=num_swapped,
|
||||
num_waiting=num_waiting,
|
||||
gpu_cache_usage=gpu_cache_usage,
|
||||
cpu_cache_usage=cpu_cache_usage,
|
||||
num_prompt_tokens=num_prompt_tokens,
|
||||
num_generation_tokens=num_generation_tokens,
|
||||
time_to_first_tokens=time_to_first_tokens,
|
||||
time_per_output_tokens=time_per_output_tokens,
|
||||
time_e2e_requests=time_e2e_requests,
|
||||
)
|
||||
|
||||
# TODO: we may not need to decode
|
||||
def _decode_sequence(self, seq: Sequence, prms: SamplingParams) -> None:
|
||||
"""Decodes the new token for a sequence."""
|
||||
(new_tokens, new_output_text, prefix_offset, read_offset) = detokenize_incrementally(
|
||||
self.get_tokenizer_for_seq(seq),
|
||||
all_input_ids=seq.get_token_ids(),
|
||||
prev_tokens=seq.tokens,
|
||||
prefix_offset=seq.prefix_offset,
|
||||
read_offset=seq.read_offset,
|
||||
skip_special_tokens=prms.skip_special_tokens,
|
||||
spaces_between_special_tokens=prms.spaces_between_special_tokens,
|
||||
)
|
||||
if seq.tokens is None:
|
||||
seq.tokens = new_tokens
|
||||
else:
|
||||
seq.tokens.extend(new_tokens)
|
||||
seq.prefix_offset = prefix_offset
|
||||
seq.read_offset = read_offset
|
||||
seq.output_text += new_output_text
|
||||
|
||||
def _check_stop(self, seq: Sequence, sampling_params: SamplingParams) -> None:
|
||||
"""Stop the finished sequences."""
|
||||
# for stop_str in sampling_params.stop:
|
||||
# if seq.output_text.endswith(stop_str):
|
||||
# self._finalize_sequence(seq, sampling_params, stop_str)
|
||||
# seq.status = SequenceStatus.FINISHED_STOPPED
|
||||
# return
|
||||
# if seq.get_last_token_id() in sampling_params.stop_token_ids:
|
||||
# stop_str = self.get_tokenizer_for_seq(seq).convert_ids_to_tokens(seq.get_last_token_id())
|
||||
# self._finalize_sequence(seq, sampling_params, stop_str)
|
||||
# seq.status = SequenceStatus.FINISHED_STOPPED
|
||||
# return
|
||||
|
||||
# Check if the sequence has reached max_model_len.
|
||||
if seq.get_len() > self.scheduler_config.max_model_len:
|
||||
seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
|
||||
return
|
||||
|
||||
# Check if the sequence has reached max_tokens.
|
||||
if seq.get_output_len() == sampling_params.max_tokens:
|
||||
seq.status = SequenceStatus.FINISHED_LENGTH_CAPPED
|
||||
return
|
||||
|
||||
# Check if the sequence has generated the EOS token.
|
||||
if ((not sampling_params.ignore_eos) and
|
||||
seq.get_last_token_id() == self.get_tokenizer_for_seq(seq).eos_token_id):
|
||||
seq.status = SequenceStatus.FINISHED_STOPPED
|
||||
return
|
||||
|
||||
def _finalize_sequence(self, seq: Sequence, sampling_params: SamplingParams, stop_string: str) -> None:
|
||||
if not sampling_params.include_stop_str_in_output and stop_string:
|
||||
# Truncate the output text so that the stop string is
|
||||
# not included in the output.
|
||||
seq.output_text = seq.output_text[:-len(stop_string)]
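# e.g. (illustrative): with stop_string "###" and output_text "42###",
# the truncated output_text becomes "42".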
|
||||
|
||||
def add_lora(self, lora_request: LoRARequest) -> bool:
|
||||
assert lora_request.lora_int_id > 0, "lora_id must be greater than 0."
|
||||
return self.worker.add_lora(lora_request)
|
||||
|
||||
def remove_lora(self, lora_id: int) -> bool:
|
||||
assert lora_id > 0, "lora_id must be greater than 0."
|
||||
return self.worker.remove_lora(lora_id)
|
||||
|
||||
def list_loras(self) -> List[int]:
|
||||
return self.worker.list_loras()
|
||||
|
||||
def sync_model_weights(self, actor_weights: Dict[str, torch.Tensor]) -> None:
|
||||
self.worker.sync_model_weights(actor_weights=actor_weights)
|
||||
|
||||
def offload_model_weights(self) -> None:
|
||||
self.worker.offload_model_weights()
|
||||
|
||||
|
||||
def initialize_cluster(
|
||||
parallel_config: ParallelConfig,
|
||||
engine_use_ray: bool = False,
|
||||
ray_address: Optional[str] = None,
|
||||
) -> Tuple[str, Optional[None]]:
|
||||
"""Initialize the distributed cluster probably with Ray.
|
||||
|
||||
Args:
|
||||
parallel_config: The configurations for parallel execution.
|
||||
engine_use_ray: Whether to use Ray for async engine.
|
||||
ray_address: The address of the Ray cluster. If None, uses
|
||||
the default Ray cluster address.
|
||||
|
||||
Returns:
|
||||
A tuple of (`distributed_init_method`, `placement_group`). The
|
||||
`distributed_init_method` is the address for initializing the
|
||||
distributed backend. `placement_group` includes the specification
|
||||
of the resources for each distributed worker.
|
||||
"""
|
||||
|
||||
# Initialize cluster locally.
|
||||
port = get_open_port()
|
||||
# We need to setup the distributed init method to make sure
|
||||
# the distributed megatron code (e.g., get world size) works correctly.
|
||||
distributed_init_method = f"tcp://localhost:{port}"
|
||||
return distributed_init_method, None
|
||||
|
||||
|
||||
def get_open_port():
|
||||
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
|
||||
s.bind(("", 0))
|
||||
return s.getsockname()[1]
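# NOTE (illustrative): in this SPMD setup every process calls initialize_cluster()
# itself; the returned distributed_init_method (e.g. "tcp://localhost:<port>") is
# handed to the Worker, which calls torch.distributed.init_process_group inside
# init_model(). The placement_group slot is always None here because Ray is not used.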
|
||||
275
verl/third_party/vllm/vllm_v_0_3_1/model_loader.py
vendored
Normal file
@@ -0,0 +1,275 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/model_loader
|
||||
"""Utilities for selecting and loading models."""
|
||||
import contextlib
|
||||
from typing import Dict, Type, Union
|
||||
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
from transformers import PretrainedConfig, PreTrainedModel
|
||||
from megatron.core.tensor_parallel.utils import VocabUtility
|
||||
|
||||
from vllm.model_executor.models import ModelRegistry
|
||||
from vllm.model_executor.weight_utils import (get_quant_config, initialize_dummy_weights)
|
||||
|
||||
from .config import ModelConfig
|
||||
from vllm.config import DeviceConfig, LoRAConfig
|
||||
from .weight_loaders import *
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata, SamplingTensors
|
||||
from vllm.sequence import SamplerOutput
|
||||
from typing import Optional
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
from vllm.model_executor.layers.sampler import _prune_hidden_states, _apply_logits_processors, _apply_penalties, _apply_top_k_top_p, _apply_min_p, _apply_penalties, _sample, _get_logprobs, _build_sampler_output
|
||||
|
||||
|
||||
@contextlib.contextmanager
|
||||
def _set_default_torch_dtype(dtype: torch.dtype):
|
||||
"""Sets the default torch dtype to the given dtype."""
|
||||
old_dtype = torch.get_default_dtype()
|
||||
torch.set_default_dtype(dtype)
|
||||
yield
|
||||
torch.set_default_dtype(old_dtype)
|
||||
|
||||
|
||||
def _get_model_architecture(config: PretrainedConfig) -> Type[nn.Module]:
|
||||
architectures = getattr(config, "architectures", [])
|
||||
for arch in architectures:
|
||||
model_cls = ModelRegistry.load_model_cls(arch)
|
||||
if model_cls is not None:
|
||||
return model_cls
|
||||
raise ValueError(f"Model architectures {architectures} are not supported for now. "
|
||||
f"Supported architectures: {ModelRegistry.get_supported_archs()}")
|
||||
|
||||
|
||||
from vllm.model_executor.layers.linear import *
|
||||
from vllm.model_executor.layers.vocab_parallel_embedding import VocabParallelEmbedding, ParallelLMHead
|
||||
from vllm.model_executor.layers.activation import ScaledActivation
|
||||
|
||||
__LAYER_WEIGHT_LOADER_REGISTRY__ = {
|
||||
ColumnParallelLinear: parallel_weight_loader,
|
||||
MergedColumnParallelLinear: parallel_weight_loader,
|
||||
QKVParallelLinear: parallel_weight_loader,
|
||||
RowParallelLinear: parallel_weight_loader,
|
||||
VocabParallelEmbedding: parallel_weight_loader,
|
||||
ParallelLMHead: parallel_weight_loader
|
||||
# "ScaledActivation.weight_loader": ScaledActivation, # TODO(shengguangming): latest commit in vllm fix awq for this function and add load_weights
|
||||
# "default_weight_loader": default_weight_loader
|
||||
}
|
||||
|
||||
# NOTE(gmsheng): change the weight_loader function in runtime
|
||||
for layer_class, weight_loader in __LAYER_WEIGHT_LOADER_REGISTRY__.items():
|
||||
layer_class.weight_loader = weight_loader
|
||||
|
||||
__MODEL_WEIGHT_LOADER_REGISTRY__ = {
|
||||
'GPT2LMHeadModel': gpt2_weight_loader,
|
||||
'LlamaForCausalLM': llama_weight_loader,
|
||||
'LLaMAForCausalLM': llama_weight_loader,
|
||||
'MistralForCausalLM': mistral_weight_loader,
|
||||
}
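# Illustrative example of how this registry is used: for a vLLM model whose class
# name is "LlamaForCausalLM", load_weights() below looks up llama_weight_loader via
# _get_model_weight_loader and lets it copy the actor's named parameters into the
# corresponding tensor-parallel shards of the vLLM model.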
|
||||
|
||||
# FIXME(shengguangming): vLLM pads the vocab to a multiple of 64, which may cause out-of-bounds indexing,
|
||||
# so we rewrite the init function of VocabParallelEmbedding below
|
||||
DEFAULT_VOCAB_PADDING_SIZE = 64
|
||||
|
||||
|
||||
def vocab_init(self,
|
||||
num_embeddings: int,
|
||||
embedding_dim: int,
|
||||
params_dtype: Optional[torch.dtype] = None,
|
||||
org_num_embeddings: Optional[int] = None,
|
||||
padding_size: int = DEFAULT_VOCAB_PADDING_SIZE):
|
||||
super(VocabParallelEmbedding, self).__init__()
|
||||
|
||||
# Keep the input dimensions.
|
||||
# TODO (pad to be divided by 4)
|
||||
self.num_embeddings = num_embeddings
|
||||
self.org_vocab_size = org_num_embeddings or num_embeddings
|
||||
|
||||
# self.num_embeddings_padded = pad_vocab_size(num_embeddings,
|
||||
# padding_size)
|
||||
self.embedding_dim = embedding_dim
|
||||
if params_dtype is None:
|
||||
params_dtype = torch.get_default_dtype()
|
||||
self.tp_size = get_tensor_model_parallel_world_size()
|
||||
# Divide the weight matrix along the vocabulary dimension.
|
||||
|
||||
self.vocab_start_index, self.vocab_end_index = (VocabUtility.vocab_range_from_global_vocab_size(
|
||||
self.num_embeddings, get_tensor_model_parallel_rank(), self.tp_size))
|
||||
self.num_embeddings_per_partition = (self.vocab_end_index - self.vocab_start_index)
|
||||
self.weight = Parameter(
|
||||
torch.empty(
|
||||
self.num_embeddings_per_partition,
|
||||
self.embedding_dim,
|
||||
# device=torch.cuda.current_device(),
|
||||
dtype=params_dtype))
|
||||
set_weight_attrs(self.weight, {"parallel_dim": 0, "weight_loader": self.weight_loader})
|
||||
|
||||
|
||||
VocabParallelEmbedding.__init__ = vocab_init
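# Worked example (illustrative): with num_embeddings=32000 and tp_size=2, the patched
# __init__ assigns rank 0 the vocab range [0, 16000) and rank 1 the range [16000, 32000);
# unlike the stock vLLM init, the vocab is not padded to a multiple of 64, which is the
# out-of-bounds issue the FIXME above refers to.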
|
||||
|
||||
|
||||
def _get_model_weight_loader(arch: str):
|
||||
if arch in __MODEL_WEIGHT_LOADER_REGISTRY__:
|
||||
return __MODEL_WEIGHT_LOADER_REGISTRY__[arch]
|
||||
raise ValueError(f"Model architectures {arch} are not supported for now. "
|
||||
f"Supported architectures: {ModelRegistry.get_supported_archs()}")
|
||||
|
||||
|
||||
def get_model(actor_model: Union[PreTrainedModel, Dict],
|
||||
model_config: ModelConfig,
|
||||
device_config: DeviceConfig,
|
||||
lora_config: Optional[LoRAConfig] = None) -> nn.Module:
|
||||
model_class = _get_model_architecture(model_config.hf_config)
|
||||
|
||||
# Get the quantization config.
|
||||
linear_method = None
|
||||
quant_config = None
|
||||
if model_config.quantization is not None:
|
||||
quant_config = get_quant_config(model_config.quantization, model_config.model, model_config.hf_config,
|
||||
model_config.download_dir)
|
||||
capability = torch.cuda.get_device_capability()
|
||||
capability = capability[0] * 10 + capability[1]
|
||||
if capability < quant_config.get_min_capability():
|
||||
raise ValueError(f"The quantization method {model_config.quantization} is not "
|
||||
"supported for the current GPU. "
|
||||
f"Minimum capability: {quant_config.get_min_capability()}. "
|
||||
f"Current capability: {capability}.")
|
||||
supported_dtypes = quant_config.get_supported_act_dtypes()
|
||||
if model_config.dtype not in supported_dtypes:
|
||||
raise ValueError(f"{model_config.dtype} is not supported for quantization "
|
||||
f"method {model_config.quantization}. Supported dtypes: "
|
||||
f"{supported_dtypes}")
|
||||
linear_method = quant_config.get_linear_method()
|
||||
|
||||
with _set_default_torch_dtype(model_config.dtype):
|
||||
# Create a model instance.
|
||||
# The weights will be initialized as empty tensors.
|
||||
# with torch.device(device_config.device):
|
||||
# NOTE(sgm): init the model in cpu
|
||||
model = model_class(model_config.hf_config, linear_method)
|
||||
|
||||
if model_config.load_format == "dummy":
|
||||
model = model.cuda()
|
||||
# NOTE(woosuk): For accurate performance evaluation, we assign
|
||||
# random values to the weights.
|
||||
initialize_dummy_weights(model)
|
||||
elif model_config.load_format == 'model' or model_config.load_format == 'auto':
|
||||
# NOTE(shengguangming) Load the weights from the actor model
|
||||
if isinstance(actor_model, nn.Module):
|
||||
load_weights(actor_weights=dict(actor_model.named_parameters(remove_duplicate=False)), vllm_model=model)
|
||||
else:
|
||||
load_weights(actor_weights=actor_model, vllm_model=model)
|
||||
|
||||
# NOTE(sgm) Some weights already point to GPU, but we still need this.
|
||||
model = model.cuda() # NOTE (zhangchi.usc1992) We need this for vllm to profile memory usage
|
||||
return model.eval()
|
||||
|
||||
|
||||
# the actor model is .state_dict()
|
||||
def load_weights(actor_weights: Dict, vllm_model: nn.Module):
|
||||
weight_loader = _get_model_weight_loader(vllm_model.__class__.__name__)
|
||||
weight_loader(actor_weights, vllm_model)
|
||||
# NOTE(sgm) to reduce peak memory usage, we offload vllm model to cpu
|
||||
# after init, so we need this after syncing model weights in the first iteration.
|
||||
vllm_model = vllm_model.cuda()
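# Illustrative usage sketch (assumed caller, not part of this file): during HybridEngine
# weight sync the trainer may pass either the actor nn.Module or its state_dict, e.g.
#
#   rollout_model = get_model(actor_model=actor.state_dict(),
#                             model_config=model_config,
#                             device_config=device_config)
#
# which routes the tensors through load_weights() above.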
|
||||
|
||||
|
||||
# FIXME(sgm): hack the Sampler function in vllm v0.3.1
|
||||
# since vLLM uses Ray, the sampler result only needs to be returned to the driver node,
|
||||
# so gather is enough. However, since we use SPMD instead of a central scheduler,
|
||||
# all_gather is required (aligned with v0.2.6)
|
||||
def _get_logits(self, hidden_states: torch.Tensor, embedding: torch.Tensor,
|
||||
embedding_bias: Optional[torch.Tensor]) -> torch.Tensor:
|
||||
# Get the logits for the next tokens.
|
||||
logits = torch.matmul(hidden_states, embedding.t())
|
||||
if embedding_bias is not None:
|
||||
logits += embedding_bias
|
||||
logits = tensor_model_parallel_all_gather(logits)
|
||||
# Remove paddings in vocab (if any).
|
||||
if logits is not None:
|
||||
logits = logits[:, :self.org_vocab_size]
|
||||
return logits
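# NOTE (illustrative shapes): with tensor parallel size 2 each rank computes a
# [num_tokens, vocab_size / 2] logits shard; tensor_model_parallel_all_gather
# concatenates the shards so every SPMD rank ends up with the full
# [num_tokens, vocab_size] logits, whereas the upstream gather-based sampler only
# materializes them on the driver rank.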
|
||||
|
||||
|
||||
def forward(
|
||||
self,
|
||||
embedding: torch.Tensor,
|
||||
hidden_states: torch.Tensor,
|
||||
sampling_metadata: SamplingMetadata,
|
||||
embedding_bias: Optional[torch.Tensor] = None,
|
||||
) -> Optional[SamplerOutput]:
|
||||
# Get the hidden states that we use for sampling.
|
||||
hidden_states = _prune_hidden_states(hidden_states, sampling_metadata)
|
||||
|
||||
# Get the logits for the next tokens.
|
||||
logits = self._get_logits(hidden_states, embedding, embedding_bias)
|
||||
# save origin logprobs for sampler_output
|
||||
origin_logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
|
||||
|
||||
# Only perform sampling in the driver worker.
|
||||
# Note: `_get_logits` is still distributed across TP workers because
|
||||
# the `embedding` weight is distributed across TP workers.
|
||||
# TODO(zhuohan): Change the get_logits part to a separate stage.
|
||||
if not sampling_metadata.perform_sampling:
|
||||
return None
|
||||
|
||||
assert logits is not None
|
||||
_, vocab_size = logits.shape
|
||||
|
||||
# Apply logits processors (if any).
|
||||
logits = _apply_logits_processors(logits, sampling_metadata)
|
||||
|
||||
# Prepare sampling tensors with pinned memory to avoid blocking.
|
||||
(sampling_tensors, do_penalties, do_top_p_top_k,
|
||||
do_min_p) = SamplingTensors.from_sampling_metadata(sampling_metadata, vocab_size, logits.device, logits.dtype)
|
||||
|
||||
# Apply presence and frequency penalties.
|
||||
if do_penalties:
|
||||
logits = _apply_penalties(logits, sampling_tensors.prompt_tokens, sampling_tensors.output_tokens,
|
||||
sampling_tensors.presence_penalties, sampling_tensors.frequency_penalties,
|
||||
sampling_tensors.repetition_penalties)
|
||||
|
||||
# Apply temperature scaling.
|
||||
# Use in-place division to avoid creating a new tensor.
|
||||
logits.div_(sampling_tensors.temperatures.unsqueeze_(dim=1))
|
||||
|
||||
if do_top_p_top_k:
|
||||
logits = _apply_top_k_top_p(logits, sampling_tensors.top_ps, sampling_tensors.top_ks)
|
||||
|
||||
if do_min_p:
|
||||
logits = _apply_min_p(logits, sampling_tensors.min_ps)
|
||||
|
||||
# We use float32 for probabilities and log probabilities.
|
||||
# Compute the probabilities.
|
||||
probs = torch.softmax(logits, dim=-1, dtype=torch.float)
|
||||
# Compute the log probabilities.
|
||||
# Use log_softmax to ensure numerical stability.
|
||||
logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
|
||||
|
||||
# Sample the next tokens.
|
||||
sample_results = _sample(probs, logprobs, sampling_metadata)
|
||||
|
||||
# Get the logprobs query results.
|
||||
# prompt_logprobs, sample_logprobs = _get_logprobs(
|
||||
# logprobs, sampling_metadata, sample_results)
|
||||
prompt_logprobs, sample_logprobs = _get_logprobs(origin_logprobs, sampling_metadata, sample_results)
|
||||
|
||||
return _build_sampler_output(sample_results, sampling_metadata, prompt_logprobs, sample_logprobs)
|
||||
|
||||
|
||||
from vllm.model_executor.layers.sampler import Sampler
|
||||
|
||||
Sampler._get_logits = _get_logits
|
||||
Sampler.forward = forward
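# NOTE: monkey-patching the class attributes means every Sampler constructed after this
# module is imported uses the all_gather based _get_logits/forward defined above; this
# keeps the change local to verl without forking vLLM, at the cost of being sensitive to
# upstream signature changes in v0.3.1.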
|
||||
285
verl/third_party/vllm/vllm_v_0_3_1/model_runner.py
vendored
Normal file
@@ -0,0 +1,285 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/model_runner.py
|
||||
|
||||
from typing import Dict, List, Optional, Tuple, Set, Union
|
||||
import contextlib
|
||||
import time
|
||||
import numpy as np
|
||||
import torch
|
||||
import torch.nn as nn
|
||||
|
||||
from vllm.config import (DeviceConfig, ModelConfig, LoRAConfig, ParallelConfig, SchedulerConfig)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.model_executor import InputMetadata, SamplingMetadata
|
||||
from vllm.sampling_params import SamplingParams, SamplingType
|
||||
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
|
||||
from vllm.lora.worker_manager import LRUCacheWorkerLoRAManager
|
||||
from vllm.lora.layers import LoRAMapping
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.utils import in_wsl
|
||||
from vllm.worker.model_runner import ModelRunner, CUDAGraphRunner, _async_h2d
|
||||
|
||||
from .model_loader import get_model
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
KVCache = Tuple[torch.Tensor, torch.Tensor]
|
||||
_PAD_SLOT_ID = -1
|
||||
LORA_WARMUP_RANK = 8
|
||||
# Capture graphs for batch size 1, 2, 4, 8, 16, 24, 32, 40, ..., 256.
|
||||
# NOTE: _get_graph_batch_size needs to be updated if this list is changed.
|
||||
_BATCH_SIZES_TO_CAPTURE = [1, 2, 4] + [8 * i for i in range(1, 33)]
|
||||
|
||||
|
||||
class ModelRunner(ModelRunner):
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
model: Union[nn.Module, Dict], # model itself or its parameter dict
|
||||
model_config: ModelConfig,
|
||||
parallel_config: ParallelConfig,
|
||||
scheduler_config: SchedulerConfig,
|
||||
device_config: DeviceConfig,
|
||||
lora_config: Optional[LoRAConfig],
|
||||
kv_cache_dtype: Optional[str] = "auto",
|
||||
):
|
||||
self.model_config = model_config
|
||||
self.parallel_config = parallel_config
|
||||
self.scheduler_config = scheduler_config
|
||||
self.lora_config = lora_config
|
||||
|
||||
# model_config can be None in tests/samplers/test_sampler.py.
|
||||
# FIXME(woosuk): This is a hack to make the tests work. Refactor this.
|
||||
self.sliding_window = (model_config.get_sliding_window() if model_config is not None else None)
|
||||
|
||||
self.device_config = (device_config if device_config is not None else DeviceConfig())
|
||||
self.device = self.device_config.device
|
||||
|
||||
self.model = model # this will be replaced by get_model()
|
||||
self.block_size = None # Set after initial profiling.
|
||||
self.lora_manager = None
|
||||
|
||||
self.graph_runners: Dict[int, CUDAGraphRunner] = {}
|
||||
self.graph_memory_pool = None # Set during graph capture.
|
||||
|
||||
self.max_context_len_to_capture = (self.model_config.max_context_len_to_capture
|
||||
if self.model_config is not None else 0)
|
||||
# When using CUDA graph, the input block tables must be padded to
|
||||
# max_context_len_to_capture. However, creating the block table in
|
||||
# Python can be expensive. To optimize this, we cache the block table
|
||||
# in numpy and only copy the actual input content at every iteration.
|
||||
# The shape of the cached block table will be
|
||||
# (max batch size to capture, max context len to capture / block size).
|
||||
self.graph_block_tables = None # Set after initial profiling.
|
||||
# cache in_wsl result
|
||||
self.in_wsl = in_wsl()
|
||||
self.kv_cache_dtype = kv_cache_dtype
|
||||
|
||||
def load_model(self) -> None:
|
||||
self.model = get_model(actor_model=self.model,
|
||||
model_config=self.model_config,
|
||||
device_config=self.device_config,
|
||||
lora_config=self.lora_config)
|
||||
vocab_size = self.model.config.vocab_size
|
||||
|
||||
if self.lora_config:
|
||||
assert hasattr(
|
||||
self.model,
|
||||
"supported_lora_modules") and self.model.supported_lora_modules, "Model does not support LoRA"
|
||||
assert hasattr(self.model, "embedding_modules"), "Model does not have embedding_modules"
|
||||
assert hasattr(self.model, "embedding_padding_modules"), "Model does not have embedding_padding_modules"
|
||||
self.lora_manager = LRUCacheWorkerLoRAManager(
|
||||
self.scheduler_config.max_num_seqs,
|
||||
self.scheduler_config.max_num_batched_tokens + self.scheduler_config.max_paddings, vocab_size,
|
||||
self.lora_config, self.device, self.model.embedding_modules, self.model.embedding_padding_modules)
|
||||
self.model = self.lora_manager.create_lora_manager(self.model)
|
||||
|
||||
def _prepare_sample(
|
||||
self,
|
||||
seq_group_metadata_list: List[SequenceGroupMetadata],
|
||||
prompt_lens: List[int],
|
||||
subquery_lens: Optional[List[int]],
|
||||
) -> SamplingMetadata:
|
||||
seq_groups: List[Tuple[List[int], SamplingParams]] = []
|
||||
selected_token_indices: List[int] = []
|
||||
selected_token_start_idx = 0
|
||||
categorized_sample_indices = {t: [] for t in SamplingType}
|
||||
categorized_sample_indices_start_idx = 0
|
||||
|
||||
max_subquery_len = max(subquery_lens) if subquery_lens else 1
|
||||
for i, seq_group_metadata in enumerate(seq_group_metadata_list):
|
||||
seq_ids = list(seq_group_metadata.seq_data.keys())
|
||||
sampling_params = seq_group_metadata.sampling_params
|
||||
seq_groups.append((seq_ids, sampling_params))
|
||||
|
||||
if seq_group_metadata.is_prompt:
|
||||
assert len(seq_ids) == 1
|
||||
assert subquery_lens is not None
|
||||
subquery_len = subquery_lens[i]
|
||||
if sampling_params.prompt_logprobs is not None:
|
||||
# NOTE: prompt token positions do not need sampling, skip them
|
||||
categorized_sample_indices_start_idx += subquery_len - 1
|
||||
|
||||
categorized_sample_indices[sampling_params.sampling_type].append(categorized_sample_indices_start_idx)
|
||||
categorized_sample_indices_start_idx += 1
|
||||
|
||||
if sampling_params.prompt_logprobs is not None:
|
||||
selected_token_indices.extend(
|
||||
range(selected_token_start_idx, selected_token_start_idx + subquery_len - 1))
|
||||
selected_token_indices.append(selected_token_start_idx + subquery_len - 1)
|
||||
selected_token_start_idx += max_subquery_len
|
||||
else:
|
||||
num_seqs = len(seq_ids)
|
||||
selected_token_indices.extend(range(selected_token_start_idx, selected_token_start_idx + num_seqs))
|
||||
selected_token_start_idx += num_seqs
|
||||
|
||||
categorized_sample_indices[sampling_params.sampling_type].extend(
|
||||
range(categorized_sample_indices_start_idx, categorized_sample_indices_start_idx + num_seqs))
|
||||
categorized_sample_indices_start_idx += num_seqs
|
||||
|
||||
selected_token_indices = _async_h2d(selected_token_indices,
|
||||
dtype=torch.long,
|
||||
target_device=self.device,
|
||||
pin_memory=not self.in_wsl)
|
||||
categorized_sample_indices = {
|
||||
t: _async_h2d(seq_ids, dtype=torch.int, target_device=self.device, pin_memory=not self.in_wsl)
|
||||
for t, seq_ids in categorized_sample_indices.items()
|
||||
}
|
||||
|
||||
seq_data: Dict[int, SequenceData] = {}
|
||||
for seq_group_metadata in seq_group_metadata_list:
|
||||
seq_data.update(seq_group_metadata.seq_data)
|
||||
|
||||
sampling_metadata = SamplingMetadata(
|
||||
seq_groups=seq_groups,
|
||||
seq_data=seq_data,
|
||||
prompt_lens=prompt_lens,
|
||||
selected_token_indices=selected_token_indices,
|
||||
categorized_sample_indices=categorized_sample_indices,
|
||||
)
|
||||
return sampling_metadata
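# Worked example (illustrative): for a prompt run with two sequences and
# subquery_lens=[5, 3] (no prompt_logprobs requested), max_subquery_len is 5 and
# selected_token_indices becomes [4, 7]: index 4 is the last token of the first
# sequence and index 7 (= 5 + 3 - 1) the last real token of the second sequence
# in the padded prompt layout.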
|
||||
|
||||
def prepare_input_tensors(
|
||||
self,
|
||||
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
|
||||
) -> Tuple[torch.Tensor, torch.Tensor, InputMetadata, SamplingMetadata, Set[int], LoRAMapping]:
|
||||
# NOTE: We assume that all sequences in the group are all prompts or
|
||||
# all decodes.
|
||||
is_prompt = seq_group_metadata_list[0].is_prompt
|
||||
# Prepare input tensors.
|
||||
if is_prompt:
|
||||
(input_tokens, input_positions, input_metadata, prompt_lens, subquery_lens, lora_index_mapping,
|
||||
lora_prompt_mapping, lora_requests) = self._prepare_prompt(seq_group_metadata_list)
|
||||
else:
|
||||
(input_tokens, input_positions, input_metadata, lora_index_mapping, lora_prompt_mapping,
|
||||
lora_requests) = self._prepare_decode(seq_group_metadata_list)
|
||||
prompt_lens = []
|
||||
subquery_lens = None
|
||||
sampling_metadata = self._prepare_sample(seq_group_metadata_list, prompt_lens, subquery_lens)
|
||||
if self.lora_config:
|
||||
flat_lora_index_mapping = [item for sublist in lora_index_mapping for item in sublist]
|
||||
lora_mapping = LoRAMapping(
|
||||
flat_lora_index_mapping,
|
||||
lora_prompt_mapping,
|
||||
)
|
||||
else:
|
||||
lora_mapping = None
|
||||
|
||||
return (input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests, lora_mapping)
|
||||
|
||||
@torch.inference_mode()
|
||||
def execute_model(
|
||||
self,
|
||||
seq_group_metadata_list: Optional[List[SequenceGroupMetadata]],
|
||||
kv_caches: List[Tuple[torch.Tensor, torch.Tensor]],
|
||||
) -> Optional[SamplerOutput]:
|
||||
(input_tokens, input_positions, input_metadata, sampling_metadata, lora_requests,
|
||||
lora_mapping) = self.prepare_input_tensors(seq_group_metadata_list)
|
||||
|
||||
if self.lora_config:
|
||||
self.set_active_loras(lora_requests, lora_mapping)
|
||||
|
||||
# Execute the model.
|
||||
if input_metadata.use_cuda_graph:
|
||||
graph_batch_size = input_tokens.shape[0]
|
||||
model_executable = self.graph_runners[graph_batch_size]
|
||||
else:
|
||||
model_executable = self.model
|
||||
hidden_states = model_executable(
|
||||
input_ids=input_tokens,
|
||||
positions=input_positions,
|
||||
kv_caches=kv_caches,
|
||||
input_metadata=input_metadata,
|
||||
)
|
||||
|
||||
# Sample the next token.
|
||||
output = self.model.sample(
|
||||
hidden_states=hidden_states,
|
||||
sampling_metadata=sampling_metadata,
|
||||
)
|
||||
return output
|
||||
|
||||
@torch.inference_mode()
|
||||
def profile_run(self) -> None:
|
||||
# Enable top-k sampling to reflect the accurate memory usage.
|
||||
vocab_size = self.model_config.get_vocab_size()
|
||||
# FIXME(sgm): these sampling params will call cumsum(), which can make
|
||||
# the deterministic cumsum implementation throw an error
|
||||
sampling_params = SamplingParams(top_p=0.99, top_k=vocab_size - 1)
|
||||
max_num_batched_tokens = self.scheduler_config.max_num_batched_tokens
|
||||
max_num_seqs = self.scheduler_config.max_num_seqs
|
||||
|
||||
# This represents the maximum number of different requests
|
||||
# that will have unique LoRAs, and therefore the max amount of memory
|
||||
# consumption. Create dummy LoRA request copies from the LoRA request
|
||||
# passed in, which contains a lora from the lora warmup path.
|
||||
dummy_lora_requests = []
|
||||
dummy_lora_requests_per_seq = []
|
||||
if self.lora_config:
|
||||
for idx in range(self.lora_config.max_loras):
|
||||
lora_id = idx + 1
|
||||
dummy_lora_request = LoRARequest(
|
||||
lora_name=f"warmup_{lora_id}",
|
||||
lora_int_id=lora_id,
|
||||
lora_local_path="/not/a/real/path",
|
||||
)
|
||||
self.lora_manager.add_dummy_lora(dummy_lora_request, rank=LORA_WARMUP_RANK)
|
||||
dummy_lora_requests.append(dummy_lora_request)
|
||||
dummy_lora_requests_per_seq = [
|
||||
dummy_lora_requests[idx % len(dummy_lora_requests)] for idx in range(max_num_seqs)
|
||||
]
|
||||
|
||||
# Profile memory usage with max_num_sequences sequences and the total
|
||||
# number of tokens equal to max_num_batched_tokens.
|
||||
seqs: List[SequenceGroupMetadata] = []
|
||||
for group_id in range(max_num_seqs):
|
||||
seq_len = (max_num_batched_tokens // max_num_seqs + (group_id < max_num_batched_tokens % max_num_seqs))
|
||||
seq_data = SequenceData([0] * seq_len)
|
||||
seq = SequenceGroupMetadata(
|
||||
request_id=str(group_id),
|
||||
is_prompt=True,
|
||||
seq_data={group_id: seq_data},
|
||||
sampling_params=sampling_params,
|
||||
block_tables=None,
|
||||
lora_request=dummy_lora_requests_per_seq[group_id] if dummy_lora_requests_per_seq else None,
|
||||
)
|
||||
seqs.append(seq)
|
||||
|
||||
# Run the model with the dummy inputs.
|
||||
num_layers = self.model_config.get_num_layers(self.parallel_config)
|
||||
kv_caches = [(None, None)] * num_layers
|
||||
self.execute_model(seqs, kv_caches)
|
||||
torch.cuda.synchronize()
|
||||
return
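# Worked example (illustrative defaults): with max_num_batched_tokens=8192 and
# max_num_seqs=256, the loop above builds 256 dummy prompt sequences of
# 8192 // 256 = 32 tokens each, so the profiling forward pass sees exactly the
# worst-case number of batched tokens.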
|
||||
147
verl/third_party/vllm/vllm_v_0_3_1/parallel_state.py
vendored
Normal file
@@ -0,0 +1,147 @@
|
||||
# Copyright 2024 Bytedance Ltd. and/or its affiliates
|
||||
# Copyright 2023 The vLLM team.
|
||||
# Adapted from
|
||||
# https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/core/parallel_state.py
|
||||
# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
|
||||
"""Model and data parallel groups."""
|
||||
|
||||
import torch
|
||||
import torch.distributed
|
||||
|
||||
import vllm.model_executor.parallel_utils.parallel_state as ps
|
||||
"""
|
||||
This version is strongly tied to Megatron to implement HybridEngine and weight sharing between vLLM and Megatron.
|
||||
- We assume the Megatron tp+dp+pp world is already established before calling this function.
|
||||
|
||||
"""
|
||||
|
||||
# Tensor model parallel group that the current rank belongs to.
|
||||
_TENSOR_MODEL_PARALLEL_GROUP = None
|
||||
|
||||
# Micro data parallel group: an additional DP group that originates from splitting the training TP
|
||||
# into infer_tp and micro_tp. By default, we use the order micro_dp - tp.
|
||||
_MICRO_DATA_PARALLEL_GROUP = None
|
||||
|
||||
|
||||
def initialize_model_parallel_from_megatron(
|
||||
tensor_model_parallel_size=None # we set None for backward compatibility to set infer_tp = train_tp
|
||||
) -> None:
|
||||
from megatron.core import parallel_state as mpu
|
||||
from megatron.distributed import new_group
|
||||
# Get world size and rank. Ensure some consistencies.
|
||||
assert torch.distributed.is_initialized()
|
||||
|
||||
if tensor_model_parallel_size is None:
|
||||
tensor_model_parallel_size = mpu.get_tensor_model_parallel_world_size()
|
||||
else:
|
||||
assert isinstance(tensor_model_parallel_size, int)
|
||||
|
||||
# Build the tensor model-parallel groups.
|
||||
assert ps._TENSOR_MODEL_PARALLEL_GROUP is None, ("tensor model parallel group is already initialized")
|
||||
|
||||
assert tensor_model_parallel_size <= mpu.get_tensor_model_parallel_world_size(
|
||||
), 'Not implemented for infer_tp > train_tp'
|
||||
|
||||
global _TENSOR_MODEL_PARALLEL_GROUP
|
||||
global _MICRO_DATA_PARALLEL_GROUP
|
||||
|
||||
assert mpu.get_tensor_model_parallel_world_size() % tensor_model_parallel_size == 0
|
||||
|
||||
micro_dp_size = mpu.get_tensor_model_parallel_world_size() // tensor_model_parallel_size
|
||||
|
||||
world_size: int = torch.distributed.get_world_size()
|
||||
|
||||
num_micro_dp_groups = world_size // micro_dp_size
|
||||
|
||||
rank = torch.distributed.get_rank()
|
||||
|
||||
# Build the micro dp groups.
|
||||
assert _MICRO_DATA_PARALLEL_GROUP is None, ("micro data parallel group is already initialized")
|
||||
for i in range(num_micro_dp_groups):
|
||||
ranks = range(i * micro_dp_size, (i + 1) * micro_dp_size)
|
||||
group = new_group(rank=rank, ranks=ranks, group_type='micro_dp')
|
||||
if rank in ranks:
|
||||
_MICRO_DATA_PARALLEL_GROUP = group
|
||||
|
||||
if tensor_model_parallel_size == mpu.get_tensor_model_parallel_world_size():
|
||||
# using the same tp group as Megatron
|
||||
ps._TENSOR_MODEL_PARALLEL_GROUP = mpu.get_tensor_model_parallel_group()
|
||||
|
||||
_TENSOR_MODEL_PARALLEL_GROUP = mpu.get_tensor_model_parallel_group()
|
||||
# no _MICRO_DATA_PARALLEL_GROUP
|
||||
else:
|
||||
# initialize a micro_dp group and a tp group
|
||||
# assume training tp=4, infer tp=2, then, weight is partitioned as
|
||||
# [1], [2], [3], [4] for training and [1,2], [1,2], [3,4], [3,4] for inference
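# Worked example (illustrative): on 4 GPUs with train_tp=4 and infer_tp=2,
# micro_dp_size = 4 // 2 = 2, so the micro-dp groups built above are {0,1} and {2,3},
# while the inference tp groups built below are {0,2} and {1,3}; this matches the
# layout sketched here, where ranks 0 and 1 both hold shards [1,2] and ranks 2 and 3
# both hold shards [3,4].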
|
||||
|
||||
# Build the inference tp groups
|
||||
train_tp = mpu.get_tensor_model_parallel_world_size()
|
||||
num_tensor_model_parallel_groups_per_train_tp = train_tp // tensor_model_parallel_size
|
||||
num_tensor_model_parallel_groups = world_size // tensor_model_parallel_size
|
||||
assert _TENSOR_MODEL_PARALLEL_GROUP is None, ("tensor model parallel group is already initialized")
|
||||
for i in range(num_tensor_model_parallel_groups // num_tensor_model_parallel_groups_per_train_tp):
|
||||
start = train_tp * i
|
||||
end = train_tp * (i + 1)
|
||||
for j in range(num_tensor_model_parallel_groups_per_train_tp):
|
||||
ranks = list(range(start, end, num_tensor_model_parallel_groups_per_train_tp))
|
||||
for i in range(len(ranks)):
|
||||
ranks[i] += j
|
||||
# group = torch.distributed.new_group(ranks)
|
||||
group = new_group(rank=rank, ranks=ranks, group_type='infer_tp')
|
||||
if rank in ranks:
|
||||
_TENSOR_MODEL_PARALLEL_GROUP = group
|
||||
ps._TENSOR_MODEL_PARALLEL_GROUP = _TENSOR_MODEL_PARALLEL_GROUP
|
||||
# Build the pipeline model-parallel groups.
|
||||
# global _PIPELINE_MODEL_PARALLEL_GROUP
|
||||
# global _PIPELINE_GLOBAL_RANKS
|
||||
# assert ps._PIPELINE_MODEL_PARALLEL_GROUP is None, ("pipeline model parallel group is already initialized")
|
||||
|
||||
# ps._PIPELINE_MODEL_PARALLEL_GROUP = mpu.get_pipeline_model_parallel_group()
|
||||
# ps._PIPELINE_GLOBAL_RANKS = mpu.get_pipeline_model_parallel_ranks()
|
||||
|
||||
|
||||
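As a sanity check on the grouping arithmetic above, the following standalone sketch (plain Python, no torch.distributed; the helper name compute_groups is illustrative) reproduces the rank layout described in the comment for train_tp=4, infer_tp=2 on an 8-GPU world:

def compute_groups(world_size, train_tp, infer_tp):
    # Mirrors the arithmetic in initialize_model_parallel_from_megatron.
    micro_dp_size = train_tp // infer_tp
    micro_dp_groups = [list(range(i * micro_dp_size, (i + 1) * micro_dp_size))
                       for i in range(world_size // micro_dp_size)]
    groups_per_train_tp = train_tp // infer_tp
    infer_tp_groups = []
    for i in range(world_size // train_tp):
        start, end = train_tp * i, train_tp * (i + 1)
        for j in range(groups_per_train_tp):
            infer_tp_groups.append([r + j for r in range(start, end, groups_per_train_tp)])
    return micro_dp_groups, infer_tp_groups

micro_dp, infer_tp = compute_groups(world_size=8, train_tp=4, infer_tp=2)
print(micro_dp)  # [[0, 1], [2, 3], [4, 5], [6, 7]]
print(infer_tp)  # [[0, 2], [1, 3], [4, 6], [5, 7]]

Each infer-tp group strides across the training-tp shards, so ranks 0..3 hold weight halves [1,2], [1,2], [3,4], [3,4], exactly as the comment in the else-branch states.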
"""
|
||||
Tensor model parallel utilities
|
||||
"""
|
||||
|
||||
|
||||
def get_tensor_model_parallel_group():
|
||||
"""Get the tensor model parallel group the caller rank belongs to."""
|
||||
assert _TENSOR_MODEL_PARALLEL_GROUP is not None, ("tensor model parallel group is not initialized")
|
||||
return _TENSOR_MODEL_PARALLEL_GROUP
|
||||
|
||||
|
||||
def get_tensor_model_parallel_world_size():
|
||||
"""Return world size for the tensor model parallel group."""
|
||||
return torch.distributed.get_world_size(group=get_tensor_model_parallel_group())
|
||||
|
||||
|
||||
def get_tensor_model_parallel_rank():
|
||||
"""Return my rank for the tensor model parallel group."""
|
||||
return torch.distributed.get_rank(group=get_tensor_model_parallel_group())
|
||||
|
||||
|
||||
def get_tensor_model_parallel_src_rank():
|
||||
"""Calculate the global rank corresponding to the first local rank
|
||||
in the tensor model parallel group."""
|
||||
global_rank = torch.distributed.get_rank()
|
||||
local_world_size = get_tensor_model_parallel_world_size()
|
||||
return (global_rank // local_world_size) * local_world_size
|
||||
|
||||
|
||||
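For intuition, get_tensor_model_parallel_src_rank simply rounds the global rank down to the first rank of its tp group. A one-line illustration with a hypothetical tp world size of 4:

src_rank = lambda global_rank, tp_size=4: (global_rank // tp_size) * tp_size
assert [src_rank(r) for r in range(8)] == [0, 0, 0, 0, 4, 4, 4, 4]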
"""
|
||||
Micro Data parallel group
|
||||
"""
|
||||
|
||||
|
||||
def get_micro_data_parallel_group():
|
||||
assert _MICRO_DATA_PARALLEL_GROUP is not None
|
||||
return _MICRO_DATA_PARALLEL_GROUP
|
||||
|
||||
|
||||
def get_micro_data_parallel_world_size():
|
||||
return torch.distributed.get_world_size(group=get_micro_data_parallel_group())
|
||||
|
||||
|
||||
def get_micro_data_parallel_rank():
|
||||
return torch.distributed.get_rank(group=get_micro_data_parallel_group())
|
||||
72
verl/third_party/vllm/vllm_v_0_3_1/tokenizer.py
vendored
Normal file
@@ -0,0 +1,72 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/transformers_utils/tokenizer_group/tokenizer_group.py

from typing import List, Optional, Tuple, Union

from transformers import (AutoTokenizer, PreTrainedTokenizer, PreTrainedTokenizerFast)

from vllm.lora.request import LoRARequest
from vllm.utils import make_async, LRUCache
from vllm.transformers_utils.tokenizers import *


class TokenizerGroup:
    """A group of tokenizers that can be used for LoRA adapters."""

    def __init__(self, tokenizer: PreTrainedTokenizer, enable_lora: bool, max_num_seqs: int,
                 max_input_length: Optional[int]):
        self.enable_lora = enable_lora
        self.max_input_length = max_input_length
        self.tokenizer = tokenizer
        if enable_lora:
            self.lora_tokenizers = LRUCache(capacity=max_num_seqs)
        else:
            self.lora_tokenizers = None

    def encode(self,
               prompt: str,
               request_id: Optional[str] = None,
               lora_request: Optional[LoRARequest] = None) -> List[int]:
        tokenizer = self.get_lora_tokenizer(lora_request)
        return tokenizer.encode(prompt)

    async def encode_async(self,
                           prompt: str,
                           request_id: Optional[str] = None,
                           lora_request: Optional[LoRARequest] = None) -> List[int]:
        tokenizer = await self.get_lora_tokenizer_async(lora_request)
        return tokenizer.encode(prompt)

    def get_lora_tokenizer(self, lora_request: Optional[LoRARequest]) -> "PreTrainedTokenizer":
        if not lora_request or not self.enable_lora:
            return self.tokenizer
        if lora_request.lora_int_id not in self.lora_tokenizers:
            # TODO(sgm): the lora tokenizer is also passed, but may be different
            tokenizer = self.tokenizer
            # tokenizer = (get_lora_tokenizer(
            #     lora_request, **self.tokenizer_config) or self.tokenizer)
            self.lora_tokenizers.put(lora_request.lora_int_id, tokenizer)
            return tokenizer
        else:
            return self.lora_tokenizers.get(lora_request.lora_int_id)

    # Async wrapper so encode_async can await the tokenizer lookup.
    get_lora_tokenizer_async = make_async(get_lora_tokenizer)

    # FIXME(sgm): for simplicity, we assign the special token here
    @property
    def pad_token_id(self):
        return self.tokenizer.pad_token_id

    @property
    def eos_token_id(self):
        return self.tokenizer.eos_token_id
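A minimal usage sketch of TokenizerGroup (the checkpoint name is illustrative; any local Hugging Face tokenizer works):

from transformers import AutoTokenizer

hf_tokenizer = AutoTokenizer.from_pretrained("gpt2")  # illustrative checkpoint
group = TokenizerGroup(tokenizer=hf_tokenizer, enable_lora=False, max_num_seqs=256, max_input_length=None)
assert group.encode("hello world") == hf_tokenizer.encode("hello world")
assert group.eos_token_id == hf_tokenizer.eos_token_id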
95
verl/third_party/vllm/vllm_v_0_3_1/weight_loaders.py
vendored
Normal file
@@ -0,0 +1,95 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/tree/main/vllm/model_executor/models

from typing import Dict
import torch
import torch.nn as nn


# NOTE(shengguangming): replace the original weight loader function in the class
def parallel_weight_loader(self, param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    """Parallel Linear weight loader."""
    assert param.size() == loaded_weight.size(
    ), 'the parameter size does not match the loaded weight size, param size: {}, loaded_weight size: {}'.format(
        param.size(), loaded_weight.size())
    assert param.data.dtype == loaded_weight.data.dtype, "to share weights, the data types must also match"

    param.data = loaded_weight.data


def default_weight_loader(param: torch.Tensor, loaded_weight: torch.Tensor) -> None:
    """Default weight loader."""
    assert param.size() == loaded_weight.size()
    assert param.data.dtype == loaded_weight.data.dtype, "to share weights, the data types must also match"

    param.data = loaded_weight.data

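The NOTE above refers to monkey-patching: both loaders rebind param.data to the loaded tensor, so vLLM and Megatron share storage instead of copying. A hedged sketch of how such a replacement might be wired up (the exact class names and module path are vLLM-version-specific and assumed here):

# Assumption: vLLM 0.3.x exposes its parallel linear layers under this path.
from vllm.model_executor.layers.linear import ColumnParallelLinear, RowParallelLinear

# Replace the per-class weight loaders so loading shares memory rather than copying.
ColumnParallelLinear.weight_loader = parallel_weight_loader
RowParallelLinear.weight_loader = parallel_weight_loader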
def gpt2_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> None:
    params_dict = dict(vllm_model.named_parameters(remove_duplicate=False))
    for name, loaded_weight in actor_weights.items():
        if "lm_head.weight" in name:
            # GPT-2 ties the weights of the embedding layer and the final
            # linear layer.
            continue
        if ".attn.bias" in name or ".attn.masked_bias" in name:
            # Skip attention mask.
            # NOTE: "c_attn.bias" should not be skipped.
            continue
        if not name.startswith("transformer."):
            name = "transformer." + name
        param = params_dict[name]
        # The HF's GPT-2 implementation uses Conv1D instead of Linear.
        # Because of this, we need to transpose the weights.
        # Note(zhuohan): the logic below might break quantized models.
        for conv1d_weight_name in ["c_attn", "c_proj", "c_fc"]:
            if conv1d_weight_name not in name:
                continue
            if not name.endswith(".weight"):
                continue
            # TODO: check megatron
            loaded_weight = loaded_weight.t()
        weight_loader = getattr(param, "weight_loader", default_weight_loader)
        weight_loader(param, loaded_weight)


def llama_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> None:
    # NOTE(shengguangming): the megatron llama may have this prefix
    prefix = '0.module.module.'
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if name[:len(prefix)] == prefix:
            name = name[len(prefix):]
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)


def mistral_weight_loader(actor_weights: Dict, vllm_model: nn.Module) -> None:
    # TODO: need to implement a general way to deal with prefix
    prefix = '0.module.module.'
    params_dict = dict(vllm_model.named_parameters())
    for name, loaded_weight in actor_weights.items():
        if name[:len(prefix)] == prefix:
            name = name[len(prefix):]
        if "rotary_emb.inv_freq" in name:
            continue
        else:
            param = params_dict[name]
            weight_loader = getattr(param, "weight_loader", default_weight_loader)
            weight_loader(param, loaded_weight)
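These per-architecture loaders are presumably dispatched by the model's architecture string; a sketch of such a dispatch table (the registry name and lookup below are illustrative, not necessarily the actual model_loader implementation):

# Illustrative dispatch only; see model_loader.load_weights for the real entry point.
_WEIGHT_LOADER_REGISTRY = {
    'GPT2LMHeadModel': gpt2_weight_loader,
    'LlamaForCausalLM': llama_weight_loader,
    'MistralForCausalLM': mistral_weight_loader,
}

def load_weights(actor_weights: Dict, vllm_model: nn.Module) -> None:
    # Assumes the vLLM model keeps its HF config with an `architectures` list.
    arch = vllm_model.config.architectures[0]
    _WEIGHT_LOADER_REGISTRY[arch](actor_weights, vllm_model)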
314
verl/third_party/vllm/vllm_v_0_3_1/worker.py
vendored
Normal file
@@ -0,0 +1,314 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
# Copyright 2023 The vLLM team.
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Adapted from https://github.com/vllm-project/vllm/blob/main/vllm/worker/worker.py
"""A GPU worker class."""
import os
import gc
from typing import Dict, List, Tuple, Optional, Union, Set

import torch
import torch.distributed
import torch.nn as nn

from vllm.config import (CacheConfig, DeviceConfig, ModelConfig, ParallelConfig, SchedulerConfig, LoRAConfig)
from vllm.model_executor import InputMetadata, set_random_seed
from vllm.model_executor.parallel_utils.parallel_state import (initialize_model_parallel)
from vllm.sampling_params import SamplingParams, SamplingType
from vllm.sequence import SamplerOutput, SequenceData, SequenceGroupMetadata
from vllm.worker.cache_engine import CacheEngine
from vllm.model_executor.parallel_utils.custom_all_reduce import init_custom_ar
from vllm.model_executor.parallel_utils.parallel_state import get_tensor_model_parallel_group

from .model_runner import ModelRunner
from .model_loader import load_weights
from .parallel_state import initialize_model_parallel_from_megatron
from vllm.lora.request import LoRARequest

class Worker:
    """A worker class that executes (a partition of) the model on a GPU.

    Each worker is associated with a single GPU. The worker is responsible for
    maintaining the KV cache and executing the model on the GPU. In case of
    distributed inference, each worker is assigned a partition of the model.
    """

    def __init__(
        self,
        model: Union[nn.Module, Dict],  # model itself or its parameter dict
        model_config: ModelConfig,
        parallel_config: ParallelConfig,
        scheduler_config: SchedulerConfig,
        device_config: DeviceConfig,
        rank: Optional[int] = None,
        distributed_init_method: Optional[str] = None,
        lora_config: Optional[LoRAConfig] = None,
        kv_cache_dtype: Optional[str] = "auto",
    ) -> None:
        # self.model = model  # will be replaced in init_model
        self.model_config = model_config
        self.parallel_config = parallel_config
        self.scheduler_config = scheduler_config
        self.rank = rank
        self.distributed_init_method = distributed_init_method
        self.lora_config = lora_config

        self.model_runner = ModelRunner(
            model,
            model_config,
            parallel_config,
            scheduler_config,
            device_config,
            lora_config=self.lora_config,
            kv_cache_dtype=kv_cache_dtype,
        )

        # Uninitialized cache engine. Will be initialized by
        # self.init_cache_engine().
        self.cache_config = None
        self.block_size = None
        self.sliding_window = None
        self.cache_engine = None
        self.cache_events = None
        self.gpu_cache = None

        # For offloading inference engine params
        self.cpu_model = None

    def init_model(self, cupy_port: Optional[int] = None):
        # torch.distributed.all_reduce does not free the input tensor until
        # the synchronization point. This causes the memory usage to grow
        # as the number of all_reduce calls increases. This env var disables
        # this behavior.
        # Related issue:
        # https://discuss.pytorch.org/t/cuda-allocation-lifetime-for-inputs-to-distributed-all-reduce/191573
        os.environ["TORCH_NCCL_AVOID_RECORD_STREAMS"] = "1"

        # Env vars will be set by TORCHRUN.
        self.rank = self.rank if self.rank is not None else int(os.getenv("RANK", "-1"))
        local_rank = int(os.getenv("LOCAL_RANK", "0"))
        self.device = torch.device(f"cuda:{local_rank}")
        if self.rank < 0:
            raise ValueError("Invalid or unspecified rank.")
        torch.cuda.set_device(self.device)

        _check_if_gpu_supports_dtype(self.model_config.dtype)

        # Initialize the distributed environment.
        # TODO: do not use cupy
        _init_distributed_environment(self.parallel_config, self.rank, self.distributed_init_method)
        if not self.parallel_config.disable_custom_all_reduce:
            init_custom_ar()
        # Initialize the model.
        set_random_seed(self.model_config.seed)
        # self.model = get_model(actor_model=self.model, model_config=self.model_config)

    def load_model(self):
        self.model_runner.load_model()

    @torch.inference_mode()
    def profile_num_available_blocks(
        self,
        block_size: int,
        gpu_memory_utilization: float,
        cpu_swap_space: int,
        cache_dtype: str,
    ) -> Tuple[int, int]:
        # Profile the memory usage of the model and get the maximum number of
        # cache blocks that can be allocated with the remaining free memory.
        torch.cuda.empty_cache()
        # torch.cuda.reset_peak_memory_stats()

        # Execute a forward pass with dummy inputs to profile the memory usage
        # of the model.
        self.model_runner.profile_run()

        # Calculate the number of blocks that can be allocated with the
        # profiled peak memory.
        torch.cuda.synchronize()
        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
        peak_memory = total_gpu_memory - free_gpu_memory

        cache_block_size = CacheEngine.get_cache_block_size(block_size, cache_dtype, self.model_config,
                                                            self.parallel_config)
        # NOTE(sgm): use the remaining memory
        num_gpu_blocks = int((free_gpu_memory * gpu_memory_utilization) // cache_block_size)
        # num_gpu_blocks = int((total_gpu_memory * gpu_memory_utilization - peak_memory) // cache_block_size)
        num_cpu_blocks = int(cpu_swap_space // cache_block_size)
        num_gpu_blocks = max(num_gpu_blocks, 0)
        num_cpu_blocks = max(num_cpu_blocks, 0)
        if self.model_runner.lora_manager:
            self.model_runner.remove_all_loras()
        gc.collect()
        torch.cuda.empty_cache()
        # Synchronize the block counts across all ranks; take the minimum so
        # every rank allocates the same number of cache blocks.
        num_gpu_blocks = torch.tensor([num_gpu_blocks], device='cuda')
        num_cpu_blocks = torch.tensor([num_cpu_blocks], device='cuda')
        torch.distributed.all_reduce(num_gpu_blocks,
                                     op=torch.distributed.ReduceOp.MIN,
                                     group=get_tensor_model_parallel_group())
        torch.distributed.all_reduce(num_cpu_blocks,
                                     op=torch.distributed.ReduceOp.MIN,
                                     group=get_tensor_model_parallel_group())
        num_gpu_blocks = num_gpu_blocks.item()
        num_cpu_blocks = num_cpu_blocks.item()
        return num_gpu_blocks, num_cpu_blocks

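As a worked example of the block accounting above (numbers are illustrative; the real cache_block_size depends on the model and cache dtype):

# Illustrative numbers only.
GiB = 1024 ** 3
free_gpu_memory = 40 * GiB                 # as reported by torch.cuda.mem_get_info() after profiling
cache_block_size = 2 * 1024 ** 2           # assume 2 MiB per KV-cache block
num_gpu_blocks = int(free_gpu_memory * 0.90 // cache_block_size)   # 18432 blocks
num_cpu_blocks = int(4 * GiB // cache_block_size)                  # 2048 blocks for a 4 GiB swap space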
    def init_cache_engine(self, cache_config: CacheConfig) -> None:
        if self.cache_engine is None and self.gpu_cache is None:
            self.cache_config = cache_config
            self.cache_engine = CacheEngine(self.cache_config, self.model_config, self.parallel_config)
            self.cache_events = self.cache_engine.events
            self.gpu_cache = self.cache_engine.gpu_cache
            self.model_runner.set_block_size(self.cache_engine.block_size)

    def free_cache_engine(self):
        # requires `enforce_eager=True`; otherwise captured CUDA graphs still
        # hold references to the cache tensors
        self.cache_engine = None
        self.gpu_cache = None

    def warm_up_model(self) -> None:
        if not self.model_config.enforce_eager:
            self.model_runner.capture_model(self.gpu_cache)
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)

    def cache_swap(
        self,
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
    ) -> None:
        # Issue cache operations.
        issued_cache_op = False
        if blocks_to_swap_in:
            self.cache_engine.swap_in(blocks_to_swap_in)
            issued_cache_op = True
        if blocks_to_swap_out:
            self.cache_engine.swap_out(blocks_to_swap_out)
            issued_cache_op = True
        if blocks_to_copy:
            self.cache_engine.copy(blocks_to_copy)
            issued_cache_op = True

        cache_events = self.cache_events if issued_cache_op else None

        # Wait for cache operations to finish.
        # TODO(woosuk): Profile swapping overhead and optimize if needed.
        if cache_events is not None:
            for event in cache_events:
                event.wait()

    @torch.inference_mode()
    def execute_model(
        self,
        seq_group_metadata_list: List[SequenceGroupMetadata],
        blocks_to_swap_in: Dict[int, int],
        blocks_to_swap_out: Dict[int, int],
        blocks_to_copy: Dict[int, List[int]],
    ) -> SamplerOutput:
        num_seq_groups = len(seq_group_metadata_list)
        self.cache_swap(blocks_to_swap_in, blocks_to_swap_out, blocks_to_copy)

        # If there is no input, we don't need to execute the model.
        if num_seq_groups == 0:
            return {}
        output = self.model_runner.execute_model(seq_group_metadata_list, self.gpu_cache)
        return output

        # # Prepare input tensors.
        # # NOTE(shengguangming): currently we pad in our dataloader and unpad it in pre_process_input,
        # # so we can just input the un-padded sequence for better performance
        # input_tokens, input_positions, input_metadata = self._prepare_inputs(seq_group_metadata_list)

        # # Execute the model.
        # output = self.model(
        #     input_ids=input_tokens,
        #     positions=input_positions,
        #     kv_caches=self.gpu_cache,
        #     input_metadata=input_metadata,
        #     cache_events=cache_events,
        # )
        # return output

    # assume the input is .state_dict()
    def sync_model_weights(self, actor_weights: Dict):
        load_weights(actor_weights, self.model_runner.model)

    def offload_model_weights(self) -> None:
        if self.cpu_model is None:
            self.cpu_model = {}
            for name, params in self.model_runner.model.named_parameters():
                self.cpu_model[name] = torch.empty_like(params, device='cpu')
                params.data = self.cpu_model[name]
        else:
            for name, params in self.model_runner.model.named_parameters():
                params.data = self.cpu_model[name]

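Note that offload_model_weights only rebinds each parameter to an (uninitialized) CPU buffer to release GPU memory; it does not preserve the weight values, which is acceptable here because sync_model_weights re-binds the parameters to fresh actor weights before the next generation. A hedged sketch of the intended HybridEngine call order (`worker` and `actor_state_dict` are placeholders):

worker.offload_model_weights()                 # free GPU weight memory between generations
# ... run a Megatron training step, producing an updated actor state dict ...
worker.sync_model_weights(actor_state_dict)    # share the new weights with the vLLM model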
    def add_lora(self, lora_request: LoRARequest) -> bool:
        return self.model_runner.add_lora(lora_request)

    def remove_lora(self, lora_id: int) -> bool:
        return self.model_runner.remove_lora(lora_id)

    def list_loras(self) -> Set[int]:
        return self.model_runner.list_loras()


def _init_distributed_environment(
    parallel_config: ParallelConfig,
    rank: int,
    distributed_init_method: Optional[str] = None,
) -> None:
    """Initialize the distributed environment."""
    if torch.distributed.is_initialized():
        print('The distributed environment has been initialized before vLLM')
    elif not distributed_init_method:
        raise ValueError("distributed_init_method must be set if torch.distributed "
                         "is not already initialized")
    else:
        torch.distributed.init_process_group(
            backend="nccl",
            world_size=parallel_config.world_size,
            rank=rank,
            # init_method=distributed_init_method,
        )

    # A small all_reduce for warmup.
    torch.distributed.all_reduce(torch.zeros(1).cuda())
    # TODO(shengguangming): maybe we should also flag whether Megatron is initialized
    if torch.distributed.get_world_size() > 1:
        initialize_model_parallel_from_megatron(tensor_model_parallel_size=parallel_config.tensor_parallel_size)
    else:
        initialize_model_parallel()


def _pad_to_alignment(x: List[int], multiple_of: int, pad: int) -> List[int]:
    return x + [pad] * ((-len(x)) % multiple_of)


def _pad_to_max(x: List[int], max_len: int, pad: int) -> List[int]:
    return x + [pad] * (max_len - len(x))

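Both padding helpers are pure functions over token-id lists; for example:

assert _pad_to_alignment([1, 2, 3], multiple_of=4, pad=0) == [1, 2, 3, 0]
assert _pad_to_max([1, 2], max_len=4, pad=0) == [1, 2, 0, 0]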
def _check_if_gpu_supports_dtype(torch_dtype: torch.dtype):
    # Check if the GPU supports the dtype.
    if torch_dtype == torch.bfloat16:
        compute_capability = torch.cuda.get_device_capability()
        if compute_capability[0] < 8:
            gpu_name = torch.cuda.get_device_name()
            raise ValueError("Bfloat16 is only supported on GPUs with compute capability "
                             f"of at least 8.0. Your {gpu_name} GPU has compute capability "
                             f"{compute_capability[0]}.{compute_capability[1]}.")