Initial commit
verl/workers/rollout/tokenizer.py (new file, 162 lines added)
@@ -0,0 +1,162 @@
# Copyright 2024 Bytedance Ltd. and/or its affiliates
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
The base tokenizer class, required for any hybrid engine based rollout or inference with vLLM.
"""
from abc import ABC, abstractmethod
from typing import Dict, List, Union

__all__ = ['HybridEngineBaseTokenizer']


class HybridEngineBaseTokenizer(ABC):
    """The tokenizer properties and method names should align with HF's tokenizer API to meet vLLM's requirements."""

    @property
    @abstractmethod
    def vocab_size(self):
        """
        `int`: Size of the base vocabulary (without the added tokens).
        """
        pass

    @property
    @abstractmethod
    def pad_token_id(self):
        """
        `Optional[int]`: Id of the padding token in the vocabulary. Returns `None` if the token has not been set.
        """
        pass

    @property
    @abstractmethod
    def eos_token_id(self):
        """
        `Optional[int]`: Id of the end of sentence token in the vocabulary. Returns `None` if the token has not been
        set.
        """
        pass

    @property
    @abstractmethod
    def all_special_ids(self) -> List[int]:
        """
        `List[int]`: List of the ids of the special tokens (`'<unk>'`, `'<cls>'`, etc.) mapped to class attributes.
        """
        pass

    @property
    @abstractmethod
    def all_special_tokens(self) -> List[str]:
        """
        `List[str]`: A list of the unique special tokens (`'<unk>'`, `'<cls>'`, ..., etc.).

        Converts tokens of `tokenizers.AddedToken` type to string.
        """
        pass

    @abstractmethod
    def encode(self, text):
        """
        Converts a string to a sequence of ids (integers), using the tokenizer and vocabulary.

        Args:
            text (`str`, `List[str]` or `List[int]`):
                The sequence to be encoded. This can be a string, a list of strings (tokenized string using the
                `tokenize` method) or a list of integers.
        """
        pass

    @abstractmethod
    def decode(
        self,
        token_ids: Union[int, List[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
        skip_special_tokens: bool = False,
        clean_up_tokenization_spaces: bool = None,
        **kwargs,
    ) -> str:
        """
        Converts a sequence of ids into a string, using the tokenizer and vocabulary, with options to remove special
        tokens and clean up tokenization spaces.

        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.

        Args:
            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
                List of tokenized input ids. Can be obtained using the `__call__` method.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.
            clean_up_tokenization_spaces (`bool`, *optional*):
                Whether or not to clean up the tokenization spaces. If `None`, will default to
                `self.clean_up_tokenization_spaces`.
            kwargs (additional keyword arguments, *optional*):
                Will be passed to the underlying model-specific decode method.

        Returns:
            `str`: The decoded sentence.
        """
        pass

    @abstractmethod
    def convert_ids_to_tokens(self,
                              ids: Union[int, List[int]],
                              skip_special_tokens: bool = False) -> Union[str, List[str]]:
        """
        Converts a single index or a sequence of indices into a token or a sequence of tokens, using the vocabulary
        and added tokens.

        Args:
            ids (`int` or `List[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `List[str]`: The decoded token(s).
        """
        pass

    @abstractmethod
    def get_added_vocab(self) -> Dict[str, int]:
        """
        Returns the added tokens in the vocabulary as a dictionary of token to index. Results might be different from
        the fast call because for now we always add the tokens even if they are already in the vocabulary. This is
        something we should change.

        Returns:
            `Dict[str, int]`: The added tokens.
        """
        pass

    @abstractmethod
    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        """
        Converts a sequence of tokens into a single string. The simplest way to do this is `" ".join(tokens)`, but we
        often want to remove sub-word tokenization artifacts at the same time.

        Args:
            tokens (`List[str]`): The tokens to join into a string.

        Returns:
            `str`: The joined tokens.
        """
        pass

    @property
    def is_fast(self):
        return False
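
For reference, below is a minimal sketch (not part of this commit) of how a concrete subclass might satisfy this interface by delegating to a Hugging Face tokenizer. The wrapper class name `HFWrappedTokenizer` and the choice of `AutoTokenizer` are illustrative assumptions, not code from this repository.

# Illustrative sketch only -- not part of this commit. It assumes a standard
# Hugging Face tokenizer is available and delegates every abstract member of
# HybridEngineBaseTokenizer to it.
from typing import Dict, List

from transformers import AutoTokenizer

from verl.workers.rollout.tokenizer import HybridEngineBaseTokenizer


class HFWrappedTokenizer(HybridEngineBaseTokenizer):
    """Hypothetical adapter that implements the interface by delegating to a HF tokenizer."""

    def __init__(self, model_name_or_path: str):
        self._tok = AutoTokenizer.from_pretrained(model_name_or_path)

    @property
    def vocab_size(self):
        return self._tok.vocab_size

    @property
    def pad_token_id(self):
        return self._tok.pad_token_id

    @property
    def eos_token_id(self):
        return self._tok.eos_token_id

    @property
    def all_special_ids(self) -> List[int]:
        return self._tok.all_special_ids

    @property
    def all_special_tokens(self) -> List[str]:
        return self._tok.all_special_tokens

    def encode(self, text):
        return self._tok.encode(text)

    def decode(self, token_ids, skip_special_tokens=False, clean_up_tokenization_spaces=None, **kwargs) -> str:
        return self._tok.decode(token_ids,
                                skip_special_tokens=skip_special_tokens,
                                clean_up_tokenization_spaces=clean_up_tokenization_spaces,
                                **kwargs)

    def convert_ids_to_tokens(self, ids, skip_special_tokens: bool = False):
        return self._tok.convert_ids_to_tokens(ids, skip_special_tokens=skip_special_tokens)

    def get_added_vocab(self) -> Dict[str, int]:
        return self._tok.get_added_vocab()

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        return self._tok.convert_tokens_to_string(tokens)

Usage would then look like `HFWrappedTokenizer("gpt2").encode("hello world")`, which simply mirrors the underlying HF call while satisfying the interface vLLM expects.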