from typing import Union |
|
|
|
import math |
|
import warnings |
|
import torch |
|
from torch.optim.lr_scheduler import _LRScheduler |
|
|
|
|
|
class WarmupLR(_LRScheduler): |
|
"""The WarmupLR scheduler |
|
|
|
This scheduler is almost same as NoamLR Scheduler except for following |
|
difference: |
|
|
|
NoamLR: |
|
lr = optimizer.lr * model_size ** -0.5 |
|
* min(step ** -0.5, step * warmup_step ** -1.5) |
|
WarmupLR: |
|
lr = optimizer.lr * warmup_step ** 0.5 |
|
* min(step ** -0.5, step * warmup_step ** -1.5) |
|
|
|
Note that the maximum lr equals to optimizer.lr in this scheduler. |
|
|
|
""" |
|
|
|
def __init__( |
|
self, |
|
optimizer: torch.optim.Optimizer, |
|
warmup_steps: Union[int, float] = 25000, |
|
last_epoch: int = -1, |
|
): |
|
self.warmup_steps = warmup_steps |
|
|
|
|
|
|
|
super().__init__(optimizer, last_epoch) |
|
|
|
def __repr__(self): |
|
return f"{self.__class__.__name__}(warmup_steps={self.warmup_steps})" |
|
|
|
def get_lr(self): |
|
step_num = self.last_epoch + 1 |
|
if self.warmup_steps == 0: |
|
return [lr * step_num**-0.5 for lr in self.base_lrs] |
|
else: |
|
return [ |
|
lr |
|
* self.warmup_steps**0.5 |
|
* min(step_num**-0.5, step_num * self.warmup_steps**-1.5) |
|
for lr in self.base_lrs |
|
] |
|
|
|
def set_step(self, step: int): |
|
self.last_epoch = step |
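
# A minimal usage sketch (illustrative only; `model` and `loader` are
# placeholders, not part of this module). WarmupLR is stepped once per
# optimizer update; the lr ramps up for `warmup_steps` updates, peaks at the
# optimizer's configured lr, and then decays proportionally to step ** -0.5:
#
#   optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
#   scheduler = WarmupLR(optimizer, warmup_steps=25000)
#   for batch in loader:
#       ...                      # forward / backward on `batch`
#       optimizer.step()
#       scheduler.step()         # advance the schedule by one step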
|
|
|
|
|
class WarmupPolicy(_LRScheduler): |
|
"""Adds warmup kwargs and warmup logic to lr policy. |
|
All arguments should be passed as kwargs for clarity, |
|
Args: |
|
warmup_steps: Number of training steps in warmup stage |
|
warmup_ratio: Ratio of warmup steps to total steps |
|
max_steps: Total number of steps while training or `None` for |
|
infinite training |
|
""" |
|
|
|
def __init__( |
|
self, |
|
optimizer, |
|
*, |
|
warmup_steps=None, |
|
warmup_ratio=None, |
|
max_steps=None, |
|
min_lr=0.0, |
|
last_epoch=-1, |
|
): |
|
        assert not (
            warmup_steps is not None and warmup_ratio is not None
        ), "Either use a particular number of steps or a ratio"
        assert (
            warmup_ratio is None or max_steps is not None
        ), "If there is a ratio, there should be a total number of steps"
|
|
|
|
|
|
|
self.max_steps = max_steps |
|
if warmup_steps is not None: |
|
self.warmup_steps = warmup_steps |
|
elif warmup_ratio is not None: |
|
self.warmup_steps = int(warmup_ratio * max_steps) |
|
else: |
|
self.warmup_steps = 0 |
|
|
|
self.min_lr = min_lr |
|
super().__init__(optimizer, last_epoch) |
|
|
|
def get_lr(self): |
|
if not self._get_lr_called_within_step: |
|
warnings.warn( |
|
"To get the last learning rate computed " |
|
"by the scheduler, please use `get_last_lr()`.", |
|
UserWarning, |
|
stacklevel=2, |
|
) |
|
|
|
step = self.last_epoch |
|
|
|
if step <= self.warmup_steps and self.warmup_steps > 0: |
|
return self._get_warmup_lr(step) |
|
|
|
        # max_steps may be None for "infinite" training; only clamp when it is set.
        if self.max_steps is not None and step > self.max_steps:
            return [self.min_lr for _ in self.base_lrs]
|
|
|
return self._get_lr(step) |
|
|
|
def _get_warmup_lr(self, step): |
|
lr_val = (step + 1) / (self.warmup_steps + 1) |
|
return [initial_lr * lr_val for initial_lr in self.base_lrs] |
|
|
|
def _get_lr(self, step): |
|
"""Simple const lr policy""" |
|
return self.base_lrs |
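
# Sketch of how the warmup parameters resolve (illustrative values only):
# with max_steps=100_000 and warmup_ratio=0.1, warmup_steps becomes
# int(0.1 * 100_000) = 10_000. During warmup the lr for each param group is
#   base_lr * (step + 1) / (warmup_steps + 1)
# and once step exceeds warmup_steps the policy defined by `_get_lr` takes over
# (a constant base_lr in this base class; subclasses override `_get_lr`).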
|
|
|
|
|
class SquareRootConstantPolicy(_LRScheduler): |
|
"""Adds warmup kwargs and warmup logic to lr policy. |
|
All arguments should be passed as kwargs for clarity, |
|
Args: |
|
warmup_steps: Number of training steps in warmup stage |
|
warmup_ratio: Ratio of warmup steps to total steps |
|
max_steps: Total number of steps while training or `None` for |
|
infinite training |
|
""" |
|
|
|
def __init__( |
|
self, |
|
optimizer, |
|
*, |
|
constant_steps=None, |
|
constant_ratio=None, |
|
max_steps=None, |
|
min_lr=0.0, |
|
last_epoch=-1, |
|
): |
|
        assert not (
            constant_steps is not None and constant_ratio is not None
        ), "Either use a particular number of steps or a ratio"
        assert (
            constant_ratio is None or max_steps is not None
        ), "If there is a ratio, there should be a total number of steps"
|
|
|
|
|
|
|
self.max_steps = max_steps |
|
if constant_steps is not None: |
|
self.constant_steps = constant_steps |
|
elif constant_ratio is not None: |
|
self.constant_steps = int(constant_ratio * max_steps) |
|
else: |
|
self.constant_steps = 0 |
|
|
|
        # Use the resolved step count (which may come from constant_ratio) and
        # guard against a zero-length constant stage.
        self.constant_lr = (
            1 / (self.constant_steps**0.5) if self.constant_steps > 0 else 0.0
        )
|
self.min_lr = min_lr |
|
super().__init__(optimizer, last_epoch) |
|
|
|
def get_lr(self): |
|
if not self._get_lr_called_within_step: |
|
warnings.warn( |
|
"To get the last learning rate computed " |
|
"by the scheduler, please use `get_last_lr()`.", |
|
UserWarning, |
|
stacklevel=2, |
|
) |
|
|
|
step = self.last_epoch |
|
|
|
        if self.constant_steps > 0 and step <= self.constant_steps:
            return [self.constant_lr for _ in self.base_lrs]
|
|
|
        if self.max_steps is not None and step > self.max_steps:
            return [self.min_lr for _ in self.base_lrs]
|
|
|
return self._get_lr(step) |
|
|
|
def _get_lr(self, step): |
|
"""Simple const lr policy""" |
|
return self.base_lrs |
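
# Note (descriptive only): during the constant stage the lr is the absolute
# value 1 / sqrt(constant_steps) for every param group, independent of the
# optimizer's base lr. For example, constant_steps=10_000 gives lr = 0.01
# for the first 10_000 steps, after which the base lrs are used.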
|
|
|
|
|
class WarmupHoldPolicy(WarmupPolicy): |
|
"""Variant of WarmupPolicy which maintains high |
|
learning rate for a defined number of steps. |
|
All arguments should be passed as kwargs for clarity, |
|
Args: |
|
warmup_steps: Number of training steps in warmup stage |
|
warmup_ratio: Ratio of warmup steps to total steps |
|
hold_steps: Number of training steps to |
|
hold the learning rate after warm up |
|
hold_ratio: Ratio of hold steps to total steps |
|
max_steps: Total number of steps while training or `None` for |
|
infinite training |
|
""" |
|
|
|
def __init__( |
|
self, |
|
optimizer, |
|
*, |
|
warmup_steps=None, |
|
warmup_ratio=None, |
|
hold_steps=None, |
|
hold_ratio=None, |
|
max_steps=None, |
|
min_lr=0.0, |
|
last_epoch=-1, |
|
): |
|
        assert not (
            hold_steps is not None and hold_ratio is not None
        ), "Either use a particular number of steps or a ratio"
        assert (
            hold_ratio is None or max_steps is not None
        ), "If there is a ratio, there should be a total number of steps"
|
|
|
self.min_lr = min_lr |
|
self._last_warmup_lr = 0.0 |
|
|
|
|
|
self.max_steps = max_steps |
|
if warmup_steps is not None: |
|
self.warmup_steps = warmup_steps |
|
elif warmup_ratio is not None: |
|
self.warmup_steps = int(warmup_ratio * max_steps) |
|
else: |
|
self.warmup_steps = 0 |
|
|
|
if hold_steps is not None: |
|
self.hold_steps = hold_steps + self.warmup_steps |
|
elif hold_ratio is not None: |
|
self.hold_steps = int(hold_ratio * max_steps) + self.warmup_steps |
|
else: |
|
self.hold_steps = 0 |
|
|
|
super().__init__( |
|
optimizer, |
|
warmup_steps=warmup_steps, |
|
warmup_ratio=warmup_ratio, |
|
max_steps=max_steps, |
|
last_epoch=last_epoch, |
|
min_lr=min_lr, |
|
) |
|
|
|
def get_lr(self): |
|
        if not self._get_lr_called_within_step:
            warnings.warn(
                "To get the last learning rate computed "
                "by the scheduler, please use `get_last_lr()`.",
                UserWarning,
                stacklevel=2,
            )
|
|
|
step = self.last_epoch |
|
|
|
|
|
if step <= self.warmup_steps and self.warmup_steps > 0: |
|
return self._get_warmup_lr(step) |
|
|
|
|
|
if (step >= self.warmup_steps) and (step < self.hold_steps): |
|
return self.base_lrs |
|
|
|
        if self.max_steps is not None and step > self.max_steps:
            return [self.min_lr for _ in self.base_lrs]
|
|
|
return self._get_lr(step) |
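
# Phase sketch (descriptive only): `self.hold_steps` is stored as an absolute
# step index (warmup_steps + hold_steps). With warmup_steps=1_000 and
# hold_steps=5_000, the lr ramps linearly over steps 0..1_000, is held at the
# base lr until step 6_000, and is then handed to `_get_lr` (constant here;
# subclasses such as NoamHoldAnnealing decay it).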
|
|
|
|
|
class WarmupAnnealHoldPolicy(_LRScheduler): |
|
"""Adds warmup kwargs and warmup logic to lr policy. |
|
All arguments should be passed as kwargs for clarity, |
|
Args: |
|
warmup_steps: Number of training steps in warmup stage |
|
warmup_ratio: Ratio of warmup steps to total steps |
|
max_steps: Total number of steps while training or `None` for |
|
infinite training |
|
min_lr: Minimum lr to hold the learning rate after decay at. |
|
constant_steps: Number of steps to keep lr constant at. |
|
constant_ratio: Ratio of steps to keep lr constant. |
|
""" |
|
|
|
def __init__( |
|
self, |
|
optimizer, |
|
*, |
|
warmup_steps=None, |
|
warmup_ratio=None, |
|
constant_steps=None, |
|
constant_ratio=None, |
|
max_steps=None, |
|
min_lr=0.0, |
|
last_epoch=-1, |
|
): |
|
        assert not (
            warmup_steps is not None and warmup_ratio is not None
        ), "Either use a particular number of steps or a ratio"
        assert not (
            constant_steps is not None and constant_ratio is not None
        ), "Either use constant_steps or constant_ratio"
        assert (
            warmup_ratio is None or max_steps is not None
        ), "If there is a ratio, there should be a total number of steps"
|
|
|
|
|
|
|
self.max_steps = max_steps |
|
|
|
if warmup_steps is not None: |
|
self.warmup_steps = warmup_steps |
|
elif warmup_ratio is not None: |
|
self.warmup_steps = int(warmup_ratio * max_steps) |
|
else: |
|
self.warmup_steps = 0 |
|
|
|
if constant_steps is not None: |
|
self.constant_steps = constant_steps |
|
elif constant_ratio is not None: |
|
self.constant_steps = int(constant_ratio * max_steps) |
|
else: |
|
self.constant_steps = 0 |
|
|
|
self.decay_steps = max_steps - (self.constant_steps + self.warmup_steps) |
|
|
|
self.min_lr = min_lr |
|
super().__init__(optimizer, last_epoch) |
|
|
|
def get_lr(self): |
|
if not self._get_lr_called_within_step: |
|
warnings.warn( |
|
"To get the last learning rate computed " |
|
"by the scheduler, please use `get_last_lr()`.", |
|
UserWarning, |
|
stacklevel=2, |
|
) |
|
|
|
step = self.last_epoch |
|
|
|
|
|
if self.warmup_steps > 0 and step <= self.warmup_steps: |
|
return self._get_warmup_lr(step) |
|
|
|
|
|
if ( |
|
self.constant_steps > 0 |
|
and (self.warmup_steps + self.decay_steps) < step <= self.max_steps |
|
): |
|
return self._get_constant_lr(step) |
|
|
|
|
|
if step > self.max_steps: |
|
return [self.min_lr for _ in self.base_lrs] |
|
|
|
return self._get_lr(step) |
|
|
|
def _get_warmup_lr(self, step): |
|
lr_val = (step + 1) / (self.warmup_steps + 1) |
|
return [initial_lr * lr_val for initial_lr in self.base_lrs] |
|
|
|
def _get_constant_lr(self, step): |
|
return [self.min_lr for _ in self.base_lrs] |
|
|
|
def _get_lr(self, step): |
|
"""Simple const lr policy""" |
|
return self.base_lrs |
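
# Phase sketch (descriptive only): decay_steps = max_steps - (warmup_steps +
# constant_steps). For example, max_steps=100_000, warmup_steps=1_000 and
# constant_steps=9_000 give a 90_000-step decay phase: linear warmup for steps
# 0..1_000, `_get_lr` for steps up to 91_000, `_get_constant_lr` (min_lr here)
# until 100_000, and min_lr afterwards.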
|
|
|
|
|
def _squareroot_annealing(initial_lr, step, max_steps, min_lr): |
|
mult = ((max_steps - step) / max_steps) ** 0.5 |
|
out_lr = initial_lr * mult |
|
out_lr = max(out_lr, min_lr) |
|
return out_lr |
|
|
|
|
|
def _square_annealing(initial_lr, step, max_steps, min_lr): |
|
mult = ((max_steps - step) / max_steps) ** 2 |
|
out_lr = initial_lr * mult |
|
out_lr = max(out_lr, min_lr) |
|
return out_lr |
|
|
|
|
|
def _cosine_annealing(initial_lr, step, max_steps, min_lr): |
|
mult = 0.5 * (1 + math.cos(math.pi * step / max_steps)) |
|
out_lr = (initial_lr - min_lr) * mult + min_lr |
|
return out_lr |
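
# Worked example (descriptive only): halfway through the schedule
# (step == max_steps / 2) the multiplier is 0.5 * (1 + cos(pi / 2)) = 0.5, so
# the lr sits exactly midway between initial_lr and min_lr; at step == max_steps
# the multiplier is 0 and the lr reaches min_lr.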
|
|
|
|
|
def _linear_warmup_with_cosine_annealing( |
|
max_lr, warmup_steps, step, decay_steps, min_lr |
|
): |
|
assert max_lr > min_lr |
|
|
|
if warmup_steps > 0 and step <= warmup_steps: |
|
return max_lr * float(step) / float(warmup_steps) |
|
|
|
|
|
if step > warmup_steps + decay_steps: |
|
return min_lr |
|
|
|
|
|
num_steps_ = step - warmup_steps |
|
decay_steps_ = decay_steps |
|
decay_ratio = float(num_steps_) / float(decay_steps_) |
|
assert decay_ratio >= 0.0 |
|
assert decay_ratio <= 1.0 |
|
delta_lr = max_lr - min_lr |
|
|
|
coeff = 0.5 * (math.cos(math.pi * decay_ratio) + 1.0) |
|
|
|
return min_lr + coeff * delta_lr |
|
|
|
|
|
def _poly_decay(initial_lr, step, decay_steps, power, min_lr, cycle): |
|
if cycle: |
|
multiplier = 1.0 if step == 0 else math.ceil(step / decay_steps) |
|
decay_steps *= multiplier |
|
else: |
|
step = min(step, decay_steps) |
|
p = step / decay_steps |
|
lr = (initial_lr - min_lr) * math.pow(1.0 - p, power) |
|
lr += min_lr |
|
return lr |
|
|
|
|
|
def _noam_hold_annealing( |
|
initial_lr, step, warmup_steps, hold_steps, decay_rate, min_lr |
|
): |
|
|
|
|
|
T_warmup_decay = max(1, warmup_steps**decay_rate) |
|
T_hold_decay = max(1, (step - hold_steps) ** decay_rate) |
|
lr = (initial_lr * T_warmup_decay) / T_hold_decay |
|
lr = max(lr, min_lr) |
|
return lr |
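
# Worked example (descriptive only): the expression simplifies to
#   lr = initial_lr * (warmup_steps / (step - hold_steps)) ** decay_rate
# once both powers exceed 1. At the end of the hold phase
# (step - hold_steps == warmup_steps) the two factors cancel and lr == initial_lr;
# beyond that the lr decays polynomially until it is clipped at min_lr.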
|
|
|
|
|
class SquareAnnealing(WarmupPolicy): |
|
def __init__(self, optimizer, *, max_steps, min_lr=1e-5, last_epoch=-1, **kwargs): |
|
super().__init__( |
|
optimizer=optimizer, |
|
max_steps=max_steps, |
|
last_epoch=last_epoch, |
|
min_lr=min_lr, |
|
**kwargs, |
|
) |
|
|
|
def _get_lr(self, step): |
|
new_lrs = [ |
|
_square_annealing( |
|
initial_lr=initial_lr, |
|
step=step - self.warmup_steps, |
|
max_steps=self.max_steps - self.warmup_steps, |
|
min_lr=self.min_lr, |
|
) |
|
for initial_lr in self.base_lrs |
|
] |
|
return new_lrs |
|
|
|
|
|
class SquareRootAnnealing(WarmupPolicy): |
|
def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs): |
|
super().__init__( |
|
optimizer=optimizer, |
|
max_steps=max_steps, |
|
last_epoch=last_epoch, |
|
min_lr=min_lr, |
|
**kwargs, |
|
) |
|
|
|
def _get_lr(self, step): |
|
new_lrs = [ |
|
_squareroot_annealing( |
|
initial_lr=initial_lr, |
|
step=step, |
|
max_steps=self.max_steps, |
|
min_lr=self.min_lr, |
|
) |
|
for initial_lr in self.base_lrs |
|
] |
|
return new_lrs |
|
|
|
|
|
class CosineAnnealing(WarmupAnnealHoldPolicy): |
|
def __init__(self, optimizer, *, max_steps, min_lr=0, last_epoch=-1, **kwargs): |
|
super().__init__( |
|
optimizer=optimizer, |
|
max_steps=max_steps, |
|
last_epoch=last_epoch, |
|
min_lr=min_lr, |
|
**kwargs, |
|
) |
|
|
|
def _get_lr(self, step): |
|
for initial_lr in self.base_lrs: |
|
if initial_lr < self.min_lr: |
|
raise ValueError( |
|
f"{self} received an initial learning rate " |
|
f"that was lower than the minimum learning rate." |
|
) |
|
|
|
if self.constant_steps is None or self.constant_steps == 0: |
|
new_lrs = [ |
|
_cosine_annealing( |
|
initial_lr=initial_lr, |
|
step=step - self.warmup_steps, |
|
max_steps=self.max_steps - self.warmup_steps, |
|
min_lr=self.min_lr, |
|
) |
|
for initial_lr in self.base_lrs |
|
] |
|
else: |
|
new_lrs = self._get_linear_warmup_with_cosine_annealing_lr(step) |
|
return new_lrs |
|
|
|
def _get_warmup_lr(self, step): |
|
if self.constant_steps is None or self.constant_steps == 0: |
|
return super()._get_warmup_lr(step) |
|
else: |
|
|
|
return self._get_linear_warmup_with_cosine_annealing_lr(step) |
|
|
|
def _get_constant_lr(self, step): |
|
|
|
return self._get_linear_warmup_with_cosine_annealing_lr(step) |
|
|
|
def _get_linear_warmup_with_cosine_annealing_lr(self, step): |
|
|
|
|
|
new_lrs = [ |
|
_linear_warmup_with_cosine_annealing( |
|
max_lr=self.base_lrs[0], |
|
warmup_steps=self.warmup_steps, |
|
step=step, |
|
decay_steps=self.decay_steps, |
|
min_lr=self.min_lr, |
|
) |
|
for _ in self.base_lrs |
|
] |
|
return new_lrs |
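
# Behaviour sketch (descriptive only): with constant_steps unset (or 0) the
# schedule is a plain cosine from each base lr down to min_lr over
# max_steps - warmup_steps. With constant_steps > 0 it switches to the linear
# warmup + cosine annealing variant above, which uses base_lrs[0] as the peak
# for every param group and holds min_lr during the constant phase.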
|
|
|
|
|
class NoamAnnealing(_LRScheduler): |
|
def __init__( |
|
self, |
|
optimizer, |
|
*, |
|
d_model, |
|
warmup_steps=None, |
|
warmup_ratio=None, |
|
max_steps=None, |
|
min_lr=0.0, |
|
last_epoch=-1, |
|
): |
|
self._normalize = d_model ** (-0.5) |
|
        assert not (
            warmup_steps is not None and warmup_ratio is not None
        ), "Either use a particular number of steps or a ratio"
        assert (
            warmup_ratio is None or max_steps is not None
        ), "If there is a ratio, there should be a total number of steps"
|
|
|
|
|
|
|
self.max_steps = max_steps |
|
if warmup_steps is not None: |
|
self.warmup_steps = warmup_steps |
|
elif warmup_ratio is not None: |
|
self.warmup_steps = int(warmup_ratio * max_steps) |
|
else: |
|
self.warmup_steps = 0 |
|
|
|
self.min_lr = min_lr |
|
super().__init__(optimizer, last_epoch) |
|
|
|
def get_lr(self): |
|
if not self._get_lr_called_within_step: |
|
warnings.warn( |
|
"To get the last learning rate computed " |
|
"by the scheduler, please use `get_last_lr()`.", |
|
UserWarning, |
|
stacklevel=2, |
|
) |
|
|
|
step = max(1, self.last_epoch) |
|
|
|
for initial_lr in self.base_lrs: |
|
if initial_lr < self.min_lr: |
|
raise ValueError( |
|
f"{self} received an initial learning rate " |
|
f"that was lower than the minimum learning rate." |
|
) |
|
|
|
new_lrs = [ |
|
self._noam_annealing(initial_lr=initial_lr, step=step) |
|
for initial_lr in self.base_lrs |
|
] |
|
return new_lrs |
|
|
|
def _noam_annealing(self, initial_lr, step): |
|
if self.warmup_steps > 0: |
|
mult = self._normalize * min( |
|
step ** (-0.5), step * (self.warmup_steps ** (-1.5)) |
|
) |
|
else: |
|
mult = self._normalize * step ** (-0.5) |
|
|
|
out_lr = initial_lr * mult |
|
if step > self.warmup_steps: |
|
out_lr = max(out_lr, self.min_lr) |
|
return out_lr |
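
# Worked example (descriptive only): the multiplier is
#   d_model ** -0.5 * min(step ** -0.5, step * warmup_steps ** -1.5),
# so each param group peaks at base_lr * (d_model * warmup_steps) ** -0.5 when
# step == warmup_steps, and min_lr is only enforced once warmup has finished.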
|
|
|
|
|
class NoamHoldAnnealing(WarmupHoldPolicy): |
|
def __init__( |
|
self, |
|
optimizer, |
|
*, |
|
max_steps, |
|
decay_rate=0.5, |
|
min_lr=0.0, |
|
last_epoch=-1, |
|
**kwargs, |
|
): |
|
""" |
|
From Nemo: |
|
Implementation of the Noam Hold Annealing policy |
|
from the SqueezeFormer paper. |
|
|
|
Unlike NoamAnnealing, the peak learning rate |
|
can be explicitly set for this scheduler. |
|
The schedule first performs linear warmup, |
|
then holds the peak LR, then decays with some schedule for |
|
the remainder of the steps. |
|
Therefore the min-lr is still dependent |
|
on the hyper parameters selected. |
|
|
|
It's schedule is determined by three factors- |
|
|
|
Warmup Steps: Initial stage, where linear warmup |
|
occurs uptil the peak LR is reached. Unlike NoamAnnealing, |
|
the peak LR is explicitly stated here instead of a scaling factor. |
|
|
|
Hold Steps: Intermediate stage, where the peak LR |
|
is maintained for some number of steps. In this region, |
|
the high peak LR allows the model to converge faster |
|
if training is stable. However the high LR |
|
may also cause instability during training. |
|
Should usually be a significant fraction of training |
|
steps (around 30-40% of the entire training steps). |
|
|
|
Decay Steps: Final stage, where the LR rapidly decays |
|
with some scaling rate (set by decay rate). |
|
To attain Noam decay, use 0.5, |
|
for Squeezeformer recommended decay, use 1.0. |
|
The fast decay after prolonged high LR during |
|
hold phase allows for rapid convergence. |
|
|
|
References: |
|
- [Squeezeformer: |
|
An Efficient Transformer for Automatic Speech Recognition] |
|
(https://arxiv.org/abs/2206.00888) |
|
|
|
Args: |
|
optimizer: Pytorch compatible Optimizer object. |
|
warmup_steps: Number of training steps in warmup stage |
|
warmup_ratio: Ratio of warmup steps to total steps |
|
hold_steps: Number of training steps to |
|
hold the learning rate after warm up |
|
hold_ratio: Ratio of hold steps to total steps |
|
max_steps: Total number of steps while training or `None` for |
|
infinite training |
|
decay_rate: Float value describing the polynomial decay |
|
after the hold period. Default value |
|
of 0.5 corresponds to Noam decay. |
|
min_lr: Minimum learning rate. |
|
""" |
|
self.decay_rate = decay_rate |
|
super().__init__( |
|
optimizer=optimizer, |
|
max_steps=max_steps, |
|
last_epoch=last_epoch, |
|
min_lr=min_lr, |
|
**kwargs, |
|
) |
|
|
|
def _get_lr(self, step): |
|
if self.warmup_steps is None or self.warmup_steps == 0: |
|
raise ValueError("Noam scheduler cannot be used without warmup steps") |
|
|
|
if self.hold_steps > 0: |
|
hold_steps = self.hold_steps - self.warmup_steps |
|
else: |
|
hold_steps = 0 |
|
|
|
new_lrs = [ |
|
_noam_hold_annealing( |
|
initial_lr, |
|
step=step, |
|
warmup_steps=self.warmup_steps, |
|
hold_steps=hold_steps, |
|
decay_rate=self.decay_rate, |
|
min_lr=self.min_lr, |
|
) |
|
for initial_lr in self.base_lrs |
|
] |
|
return new_lrs |
|
|
|
def set_step(self, step: int): |
|
self.last_epoch = step |
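
# A minimal configuration sketch (illustrative values only): for a 100_000-step
# run, warmup_steps=1_000 and hold_steps=30_000 follow the docstring's advice of
# holding the peak LR for roughly 30-40% of training, and decay_rate=1.0 uses
# the Squeezeformer-recommended decay:
#
#   scheduler = NoamHoldAnnealing(
#       optimizer,
#       max_steps=100_000,
#       warmup_steps=1_000,
#       hold_steps=30_000,
#       decay_rate=1.0,
#       min_lr=1e-6,
#   )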
|
|