Co-authored-by: Simon Alibert <simon.alibert@huggingface.co>
Co-authored-by: Simon Alibert <75076266+aliberts@users.noreply.github.com>
Co-authored-by: Pablo <pablo.montalvo.leroux@gmail.com>
This commit is contained in:
Remi
2025-02-04 18:01:04 +01:00
committed by GitHub
parent dd974529cf
commit 638d411cd3
26 changed files with 2365 additions and 92 deletions

View File

@@ -8,8 +8,6 @@ import torch
@dataclass
class OptimizerConfig(draccus.ChoiceRegistry, abc.ABC):
lr: float
betas: tuple[float, float]
eps: float
weight_decay: float
grad_clip_norm: float
@@ -54,3 +52,19 @@ class AdamWConfig(OptimizerConfig):
kwargs = asdict(self)
kwargs.pop("grad_clip_norm")
return torch.optim.AdamW(params, **kwargs)
@OptimizerConfig.register_subclass("sgd")
@dataclass
class SGDConfig(OptimizerConfig):
lr: float = 1e-3
momentum: float = 0.0
dampening: float = 0.0
nesterov: bool = False
weight_decay: float = 0.0
grad_clip_norm: float = 10.0
def build(self, params: dict) -> torch.optim.Optimizer:
kwargs = asdict(self)
kwargs.pop("grad_clip_norm")
return torch.optim.SGD(params, **kwargs)

View File

@@ -54,3 +54,38 @@ class VQBeTSchedulerConfig(LRSchedulerConfig):
return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(self.num_cycles) * 2.0 * progress)))
return LambdaLR(optimizer, lr_lambda, -1)
@LRSchedulerConfig.register_subclass("cosine_decay_with_warmup")
@dataclass
class CosineDecayWithWarmupSchedulerConfig(LRSchedulerConfig):
"""Used by Physical Intelligence to train Pi0"""
num_warmup_steps: int
num_decay_steps: int
peak_lr: float
decay_lr: float
def build(self, optimizer: Optimizer, num_training_steps: int) -> LambdaLR:
del num_training_steps
def lr_lambda(current_step):
def linear_warmup_schedule(current_step):
if current_step <= 0:
return 1 / (self.num_warmup_steps + 1)
frac = 1 - current_step / self.num_warmup_steps
return (1 / (self.num_warmup_steps + 1) - 1) * frac + 1
def cosine_decay_schedule(current_step):
step = min(current_step, self.num_decay_steps)
cosine_decay = 0.5 * (1 + math.cos(math.pi * step / self.num_decay_steps))
alpha = self.decay_lr / self.peak_lr
decayed = (1 - alpha) * cosine_decay + alpha
return decayed
if current_step < self.num_warmup_steps:
return linear_warmup_schedule(current_step)
return cosine_decay_schedule(current_step)
return LambdaLR(optimizer, lr_lambda, -1)