Initial commit
verl/utils/memory_buffer.py (new file, 214 lines)
@@ -0,0 +1,214 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
This file contains utilities to manipulate torch memory buffers.
"""

from typing import Dict, List

import torch
from torch import nn

class MemoryBuffer:
    """
    A memory buffer is a contiguous torch tensor that may back multiple tensors sharing
    the same underlying memory. All tensors in a buffer must share a single dtype.
    """

    def __init__(self, numel: int, numel_padded: int, dtype: torch.dtype):
        self.numel = numel
        self.numel_padded = numel_padded
        self.dtype = dtype
        self.data = torch.zeros(self.numel_padded, dtype=self.dtype, device='cuda', requires_grad=False)

    def zero(self):
        """Reset the buffer to zero."""
        self.data.zero_()

    def get(self, shape, start_index):
        """Return a tensor with the input `shape` as a view into the
        1-D data starting at `start_index`."""
        end_index = start_index + shape.numel()
        assert end_index <= self.numel, \
            'requested tensor is out of the buffer range.'
        buffer_tensor = self.data[start_index:end_index]
        buffer_tensor = buffer_tensor.view(shape)
        return buffer_tensor

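# A minimal usage sketch (assumes a CUDA device is available, since
# MemoryBuffer allocates on 'cuda'):
#
#   buf = MemoryBuffer(numel=12, numel_padded=16, dtype=torch.float32)
#   a = buf.get(torch.Size((2, 3)), start_index=0)  # view over elements 0..5
#   b = buf.get(torch.Size((6,)), start_index=6)    # view over elements 6..11
#   a.fill_(1.0)                                    # writes through to buf.data
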
def calc_padded_numel(shape: torch.Size, dtype: torch.dtype):
    """For CUDA memory alignment, pad the element count up to a 128-bit boundary."""
    # finfo covers floating-point dtypes; fall back to iinfo for integer dtypes
    bits = torch.finfo(dtype).bits if dtype.is_floating_point else torch.iinfo(dtype).bits
    align_numel = 128 // bits
    numel = shape.numel()
    return (numel + align_numel - 1) // align_numel * align_numel

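# For example, torch.float16 is 16 bits wide, so the alignment unit is
# 128 // 16 = 8 elements; a tensor of shape (3, 3) with numel() == 9 is
# therefore padded up to 16 elements:
#
#   assert calc_padded_numel(torch.Size((3, 3)), torch.float16) == 16
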
def get_weight_buffer_meta_from_module(module: nn.Module) -> Dict[str, Dict]:
    """
    Return a dictionary mapping each parameter name to its shape and dtype.
    """
    weight_buffer_meta = {}
    for name, param in sorted(module.named_parameters()):
        weight_buffer_meta[name] = {'shape': param.shape, 'dtype': param.dtype}
    return weight_buffer_meta

def build_memory_buffer(weight_buffer_meta: Dict[str, Dict]) -> Dict[torch.dtype, MemoryBuffer]:
    """Build the memory buffers given weight_buffer_meta.

    Args:
        weight_buffer_meta: mapping from name to a dict containing the shape and dtype of each tensor

    Returns: a dict from dtype to one large MemoryBuffer that can hold all the tensors of that dtype

    """
    memory_buffers = {}
    total_numel_map = {}  # map from dtype to the total numel
    for name, meta_info in sorted(weight_buffer_meta.items()):
        shape = meta_info['shape']
        dtype = meta_info['dtype']

        assert isinstance(shape, torch.Size)
        assert isinstance(dtype, torch.dtype)

        if dtype not in total_numel_map:
            total_numel_map[dtype] = 0

        total_numel_map[dtype] += calc_padded_numel(shape, dtype)

    for dtype, total_numel in total_numel_map.items():
        memory_buffers[dtype] = MemoryBuffer(total_numel, total_numel, dtype)

    return memory_buffers

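# A minimal sketch of how the two helpers above compose (assumes a CUDA
# device; `nn.Linear(4, 4)` is just an illustrative module):
#
#   model = nn.Linear(4, 4).cuda()
#   meta = get_weight_buffer_meta_from_module(model)
#   buffers = build_memory_buffer(meta)  # here: one float32 MemoryBuffer
#   # weight (16 elements) + bias (4 elements), each already 128-bit aligned
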
def build_memory_reference_from_module(module: torch.nn.Module,
                                       memory_buffers: Dict[torch.dtype, MemoryBuffer],
                                       maintain_weight: bool = True):
    """Rebind each parameter of `module` to a view into `memory_buffers`.

    If `maintain_weight` is True, the current parameter values are copied into
    the buffer before the parameter data is redirected to it.
    """
    start_index = {dtype: 0 for dtype in memory_buffers}
    for name, param in sorted(module.named_parameters()):
        memory_buffer = memory_buffers[param.dtype]
        buffer = memory_buffer.get(shape=param.shape, start_index=start_index[param.dtype])
        # advance the per-dtype offset by the padded size of this tensor
        start_index[param.dtype] += calc_padded_numel(param.shape, param.dtype)
        if maintain_weight:
            buffer.copy_(param.data)
        param.data = buffer

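# Hypothetical end-to-end sketch: after the call, the parameters alias the
# buffer storage, so zeroing the buffer is visible through the module:
#
#   model = nn.Linear(4, 4).cuda()
#   buffers = build_memory_buffer(get_weight_buffer_meta_from_module(model))
#   build_memory_reference_from_module(model, buffers)
#   buffers[torch.float32].zero()
#   assert model.weight.abs().sum().item() == 0.0
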
def build_memory_reference(weight_buffer_meta: Dict[str, Dict], memory_buffers: Dict[torch.dtype, MemoryBuffer]):
    """Build the memory references. The memory buffers are built using the build_memory_buffer API.
    This API allocates, for each entry in weight_buffer_meta, a weight buffer that points into the
    corresponding memory buffer.

    Args:
        weight_buffer_meta: mapping from name to a dict containing the shape and dtype of each tensor
        memory_buffers: mapping from dtype to the MemoryBuffer backing all tensors of that dtype

    Returns:
        a dict mapping each name to a tensor view into its memory buffer

    """
    start_idx = {dtype: 0 for dtype in memory_buffers}
    weight_buffers = {}

    for name, meta_info in sorted(weight_buffer_meta.items()):
        shape = meta_info['shape']
        dtype = meta_info['dtype']

        buffer = memory_buffers[dtype].get(shape, start_index=start_idx[dtype])
        start_idx[dtype] += calc_padded_numel(shape, dtype)
        weight_buffers[name] = buffer

    return weight_buffers

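# Sketch: rebuilding name -> view mappings from metadata alone, e.g. on a
# process that holds the buffers but not the original nn.Module (reusing
# `meta` and `buffers` from the earlier sketches):
#
#   views = build_memory_reference(meta, buffers)
#   views['weight'].copy_(torch.ones(4, 4, device='cuda'))  # lands in the shared buffer
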
class MemoryBufferModuleWrapper:
    """
    Note that MemoryBufferModuleWrapper is deliberately not an nn.Module:
    - wrapping the module would change the parameter names in the checkpoint
    """

    def __init__(self, module: nn.Module):
        super().__init__()
        self.module = module
        self.weight_buffer_meta = get_weight_buffer_meta_from_module(self.module)
        self.memory_buffers = build_memory_buffer(self.weight_buffer_meta)
        build_memory_reference_from_module(self.module, self.memory_buffers)

    def get_memory_buffers(self):
        return self.memory_buffers

    def get_weight_buffer_meta(self):
        return self.weight_buffer_meta

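# A minimal usage sketch (assumes a CUDA device):
#
#   model = nn.Linear(4, 4).cuda()
#   wrapped = MemoryBufferModuleWrapper(model)
#   # the parameters of `model` now live inside per-dtype contiguous buffers
#   assert torch.float32 in wrapped.get_memory_buffers()
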
class MegatronMemoryBufferForRollout(object):
    """
    We assume that
    - the inference engine has tp + dp
    - the actor has tp + pp + dp
    - the tp size of the inference engine and the actor must match
    - memory_buffers: a list of memory_buffers, one per pp stage; each is a dict from dtype to MemoryBuffer
    - weight_buffers: a list of weight_buffers, one per pp stage; each is a dict from name to param
    - named_parameters: a dict from name to parameter that normalizes the names across pp and vpp. Note
      that the named_parameters may not be directly compatible with the inference engine. The user has to
      take care of layout mismatches (e.g. qkv transpose).
    - weight_buffers, named_parameters and memory_buffers share the same underlying GPU memory.
    - When syncing weights, the data is transferred via the memory buffers.
    """

    def __init__(self, transform_memory_param_fn):
        self._memory_buffers = []
        self._weight_buffers = []
        self._named_parameters = {}
        self.transform_memory_param_fn = transform_memory_param_fn

    def initialize_weight_buffer(self, weight_buffer_meta_pp: List[Dict[str, Dict]]):
        """
        Initialize the weight buffer. The weight buffer layout is obtained from the actor. We construct
        one large buffer per dtype in each weight_buffer.

        Args:
            weight_buffer_meta_pp: a list with one entry per pp stage; each entry maps a parameter
                name to a dict containing the tensor's shape and dtype

        Returns: None

        """
        self.weight_buffer_meta_pp = weight_buffer_meta_pp

        for weight_buffer_meta in self.weight_buffer_meta_pp:
            memory_buffer = build_memory_buffer(weight_buffer_meta)
            self._memory_buffers.append(memory_buffer)
            self._weight_buffers.append(None)

    def build_memory_reference(self):
        for i, weight_buffer_meta in enumerate(self.weight_buffer_meta_pp):
            self._weight_buffers[i] = build_memory_reference(weight_buffer_meta, self._memory_buffers[i])
        self._named_parameters = self.transform_memory_param_fn(self._weight_buffers)

    @property
    def named_parameters(self):
        return self._named_parameters

    @property
    def weight_buffers(self):
        return self._weight_buffers

    @property
    def memory_buffers(self):
        return self._memory_buffers
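# Hypothetical usage sketch for a 2-stage pipeline. `flatten_stages`,
# `meta_stage0` and `meta_stage1` are illustrative names, not part of this file:
#
#   def flatten_stages(weight_buffers):
#       # merge per-pp-stage dicts into one flat name -> param dict
#       return {name: p for stage in weight_buffers for name, p in stage.items()}
#
#   rollout = MegatronMemoryBufferForRollout(flatten_stages)
#   rollout.initialize_weight_buffer(weight_buffer_meta_pp=[meta_stage0, meta_stage1])
#   rollout.build_memory_reference()
#   params = rollout.named_parameters  # flat dict backed by the memory buffers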