# Source code for mlbench_core.lr_scheduler.pytorch.lr

# -*- coding: utf-8 -*-

import math
from bisect import bisect_right

from torch.optim.lr_scheduler import LambdaLR, ReduceLROnPlateau


class LRLinearWarmUp(LambdaLR):
    """Linear learning-rate warm-up.

    The LR starts at ``init_lr`` on the first iteration and grows linearly,
    reaching ``scaled_lr`` at iteration ``warmup_duration + 1`` (i.e.
    ``warmup_duration`` steps of warm-up).

    In :cite:`goyal2017accurate`, warm-up is used in order to apply the
    ``Linear Scaling Rule``: starting from ``base_lr``, the LR gradually
    increases to ``base_lr * scaling_factor``.

    Args:
        optimizer (:obj:`torch.optim.Optimizer`): Wrapped optimizer.
        init_lr (float): Initial LR at beginning of warm-up.
        scaled_lr (float): LR at end of warm-up.
        warmup_duration (float): Duration of warm-up.
    """

    def __init__(self, optimizer, init_lr, scaled_lr, warmup_duration):
        self.warmup_duration = warmup_duration
        self.scaled_lr = scaled_lr
        self.init_lr = init_lr
        self.optimizer = optimizer

        # LambdaLR multiplies `initial_lr` by the factor from `f`, so pin the
        # groups' base LR to the post-warm-up value.
        for group in self.optimizer.param_groups:
            group["initial_lr"] = self.scaled_lr
            group["lr"] = self.scaled_lr

        super().__init__(self.optimizer, self.f)

    def f(self, duration):
        """Multiplicative factor applied to ``scaled_lr`` at step ``duration``."""
        if self.warmup_duration <= 0 or duration > self.warmup_duration:
            return 1
        progress = duration / self.warmup_duration
        # Interpolates from init_lr/scaled_lr (progress=0) to 1 (progress=1).
        return progress + ((1 - progress) * self.init_lr) / self.scaled_lr

    @property
    def duration(self):
        # Number of warm-up steps.
        return self.warmup_duration
class MultiStepLRLinearWarmUp(LambdaLR):
    """Multi-step learning-rate decay preceded by a linear warm-up period.

    Args:
        optimizer (:obj:`torch.optim.Optimizer`): an optimizer for the given
            model.
        gamma (float): Decay factor for the learning rate.
        milestones (:obj:`list` of :obj:`int`): The epochs/steps at which to
            reduce the learning rate.
        scaled_lr (float): The LR to reach after warm-up.
        warmup_init_lr (float): The initial learning rate to use for the
            warm-up epochs. Default: 0
        warmup_duration (int): The number of epochs to perform warm-up before
            regular LR scaling starts. Default: 0
    """

    def __init__(
        self,
        optimizer,
        gamma,
        milestones,
        scaled_lr,
        warmup_init_lr=0,
        warmup_duration=0,
    ):
        if list(milestones) != sorted(milestones):
            raise ValueError(
                "Milestones should be a list of increasing integers."
                "Got {}".format(milestones)
            )
        if warmup_duration >= milestones[0]:
            raise ValueError(
                "The scaling phase should be earlier than the first milestone."
                "Got {} and {}".format(warmup_duration, milestones[0])
            )

        self.gamma = gamma
        self.milestones = milestones
        # Warm-up factor computation is delegated to LRLinearWarmUp.
        self.warmup = LRLinearWarmUp(
            optimizer=optimizer,
            init_lr=warmup_init_lr,
            scaled_lr=scaled_lr,
            warmup_duration=warmup_duration,
        )
        super(MultiStepLRLinearWarmUp, self).__init__(optimizer, self.f)

    def f(self, duration):
        """LR factor: warm-up ramp first, then ``gamma**k`` after k milestones."""
        if duration <= self.warmup.duration:
            return self.warmup.f(duration)
        return self.gamma ** bisect_right(self.milestones, duration)
class ReduceLROnPlateauWithWarmup(ReduceLROnPlateau):
    """``ReduceLROnPlateau`` preceded by a linear warm-up period.

    Args:
        optimizer (:obj:`torch.optim.Optimizer`): an optimizer for the given
            model.
        warmup_init_lr (float): LR at beginning of warm-up.
        scaled_lr (float): LR at end of warm-up.
        warmup_epochs (int): Number of epochs for warm-up.
        batches_per_epoch (int, optional): Number of batches per epoch if we
            want a warm-up per batch.
        **kwargs: Arguments for ReduceLROnPlateau.
    """

    def __init__(
        self,
        optimizer,
        warmup_init_lr,
        scaled_lr,
        warmup_epochs,
        batches_per_epoch=None,
        **kwargs
    ):
        self.warmup_epochs = warmup_epochs
        # Per-batch warm-up when batches_per_epoch is given, otherwise per-epoch.
        self.warmup_duration = warmup_epochs * (batches_per_epoch or 1)
        self.warmup_init_lr = warmup_init_lr
        self.scaled_lr = scaled_lr
        self.optimizer = optimizer

        self.batch_idx = 0
        # No warm-up at all when warmup_epochs <= 0.
        self.finished_warmup = warmup_epochs <= 0
        self.base_lr = scaled_lr if self.finished_warmup else warmup_init_lr
        self._set_lr(self.base_lr)

        super(ReduceLROnPlateauWithWarmup, self).__init__(optimizer, **kwargs)

    def batch_step(self):
        """Advance the warm-up by one batch.

        Sets the LR to::

            progress = batch_idx / warmup_duration
            new_lr = progress * scaled_lr + (1 - progress) * warmup_init_lr
        """
        if self.batch_idx >= self.warmup_duration:
            return

        self.batch_idx += 1
        fraction = self.batch_idx / self.warmup_duration
        self._set_lr(
            fraction * self.scaled_lr + (1 - fraction) * self.warmup_init_lr
        )
        # Mark the warm-up as done once the last warm-up step is taken.
        if self.batch_idx == self.warmup_duration:
            self.finished_warmup = True

    def step(self, metrics, epoch=None):
        """Scheduler step at end of epoch.

        Delegates to ``ReduceLROnPlateau`` once the warm-up is done, and calls
        :meth:`batch_step` while warming up per epoch.

        Args:
            metrics (float): Current loss.
        """
        if self.finished_warmup:
            # Only reduce on plateau after warm-up has completed.
            super(ReduceLROnPlateauWithWarmup, self).step(metrics, epoch=None)
            return

        # Still in warm-up.
        if epoch is not None:
            raise ValueError("Epoch argument must be none")
        self.last_epoch += 1

        # Warm-up is per epoch (not per batch), so advance it here.
        if self.warmup_epochs > 0 and self.warmup_epochs == self.warmup_duration:
            self.batch_step()

    def _set_lr(self, new_lr):
        # Apply `new_lr` to every parameter group.
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = new_lr
class SparsifiedSGDLR(LambdaLR):
    """Learning-rate schedule for sparsified SGD.

    Computes ``gamma / (l2_coef * (t + shifting_param))``.

    Args:
        optimizer (:obj:`torch.optim.Optimizer`): an optimizer for the given
            model.
        gamma (float): The constant value in the numerator of the learning
            rate schedule formula.
        l2_coef (float): The regularization rate which is used in the
            denominator of the learning rate schedule formula.
        shifting_param (float): The constant value in the denominator of the
            learning rate schedule formula.
    """

    def __init__(self, optimizer, gamma, l2_coef, shifting_param):
        self.shifting_param = shifting_param
        self.optimizer = optimizer

        # The lambda factor scales `initial_lr`, so pin it to gamma / l2_coef.
        base = gamma / l2_coef
        for group in self.optimizer.param_groups:
            group["initial_lr"] = base
        self.optimizer.base_lrs = [base for _ in self.optimizer.param_groups]

        super(SparsifiedSGDLR, self).__init__(self.optimizer, self.f)

    def f(self, iteration):
        # 1 / (t + shifting_param), floored so the factor never exceeds 1.
        return 1 / max(1, self.shifting_param + iteration)
class TimeDecayLR(LambdaLR):
    """Time-based decay learning-rate schedule for SGD: ``alpha / (t + beta)``.

    Args:
        optimizer (:obj:`torch.optim.Optimizer`): an optimizer for the given
            model.
        beta (float): The constant value in the denominator of the learning
            rate schedule formula.

    Returns:
        A learning rate scheduler (:obj:`torch.optim.lr_scheduler.LambdaLR`)
    """

    def __init__(self, optimizer, beta):
        self.beta = beta
        super(TimeDecayLR, self).__init__(optimizer, self.f)

    def f(self, iteration):
        # Factor shrinks hyperbolically with the iteration count.
        return 1 / (iteration + self.beta)
class SQRTTimeDecayLR(LambdaLR):
    """Time-based decay learning-rate schedule for SGD: ``alpha / sqrt(t)``.

    Returns:
        A learning rate scheduler (:obj:`torch.optim.lr_scheduler.LambdaLR`)
    """

    def __init__(self, optimizer):
        super(SQRTTimeDecayLR, self).__init__(optimizer, self.f)

    def f(self, iteration):
        # Clamp to 1 so iteration 0 does not divide by zero.
        return 1.0 / math.sqrt(max(iteration, 1))
class ExponentialWarmupMultiStepLR(LambdaLR):
    """Learning-rate scheduler with exponential warm-up and step decay.

    ``warmup_steps``, ``remain_steps`` and ``decay_interval`` accept both
    integers and floats. An integer is interpreted as an absolute iteration
    index; a float as a fraction of the total training iterations
    (epochs * steps_per_epoch).

    If ``decay_interval`` is None, decays happen at regularly spaced intervals
    (``decay_steps`` decays between iteration indices ``remain_steps`` and
    ``iterations``).

    Args:
        optimizer: instance of optimizer
        iterations (int): total number of training iterations
        warmup_steps (int): number of warmup iterations
        remain_steps (int|float): start decay at 'remain_steps' iteration
        decay_interval (int|float): interval between LR decay steps
        decay_steps (int): max number of decay steps
        decay_factor (float): decay factor
    """

    def __init__(
        self,
        optimizer,
        iterations,
        warmup_steps=0,
        remain_steps=1.0,
        decay_interval=None,
        decay_steps=4,
        decay_factor=0.5,
    ):
        # Iterations before the learning rate reaches the base LR.
        self.warmup_steps = self.convert_relative_stepsize(warmup_steps, iterations)
        # Iteration at which decay starts.
        self.remain_steps = self.convert_relative_stepsize(remain_steps, iterations)

        if decay_interval is None:
            # Spread `decay_steps` decays evenly over the remaining iterations.
            span = iterations - self.remain_steps
            self.decay_interval = max(span // decay_steps, 1)
        else:
            self.decay_interval = self.convert_relative_stepsize(
                decay_interval, iterations
            )

        # Multiplicative decay factor.
        self.decay_factor = decay_factor
        # Max number of decay steps.
        self.decay_steps = decay_steps

        # Warm-up cannot extend past the start of the decay phase.
        if self.warmup_steps > self.remain_steps:
            self.warmup_steps = self.remain_steps

        super(ExponentialWarmupMultiStepLR, self).__init__(optimizer, self.f)

    @staticmethod
    def convert_relative_stepsize(param, total):
        """Turn a fractional step size (float) into an absolute one (int)."""
        if isinstance(param, float):
            param = int(param * total)
        return param

    def f(self, duration):
        # NOTE(review): mixes `duration` and `self.last_epoch`; they are the
        # same value when LambdaLR invokes this — confirm before external use.
        if duration <= self.warmup_steps:
            # Exponential ramp from 1% of the base LR up to the base LR.
            if self.warmup_steps:
                base = math.exp(math.log(0.01) / self.warmup_steps)
            else:
                base = 1.0
            return base ** (self.warmup_steps - self.last_epoch)
        if self.last_epoch >= self.remain_steps:
            # Step decay: one extra decay every `decay_interval` iterations,
            # capped at `decay_steps` decays total.
            done = (self.last_epoch - self.remain_steps) // self.decay_interval + 1
            return self.decay_factor ** min(done, self.decay_steps)
        return 1
class SQRTTimeDecayLRWithWarmup(LambdaLR):
    """Inverse-square-root LR schedule with linear warm-up steps.

    During warm-up::

        lrs = torch.linspace(warmup_init_lr, base_lr, warmup_steps)
        lr = lrs[update_num]

    After warm-up::

        lr = base_lr * decay_factor

    where ``decay_factor = sqrt(warmup_steps / current_iteration)``.

    Args:
        optimizer (:obj:`torch.optim`): The optimizer.
        base_lr (float): The base LR after warm-up.
        warmup_init_lr (float): LR at start of training.
        warmup_steps (int): Number of warm-up steps.
    """

    def __init__(self, optimizer, base_lr, warmup_init_lr, warmup_steps):
        # Warm-up factor computation is delegated to LRLinearWarmUp.
        self.warmup = LRLinearWarmUp(
            optimizer=optimizer,
            init_lr=warmup_init_lr,
            scaled_lr=base_lr,
            warmup_duration=warmup_steps,
        )
        super(SQRTTimeDecayLRWithWarmup, self).__init__(optimizer, self.f)

    def f(self, iteration):
        if iteration <= self.warmup.duration:
            # Still warming up.
            return self.warmup.f(iteration)
        # decay_factor = sqrt(warmup_steps / iteration)
        return (self.warmup.duration / iteration) ** 0.5
class PolyDecayLRLinearWarmup(LambdaLR):
    """Polynomial decay of the learning rate with linear warm-up.

    During warm-up::

        lrs = torch.linspace(warmup_init_lr, base_lr, warmup_steps)
        lr = lrs[update_num]

    After warm-up::

        lr = base_lr * decay_factor

    where ``decay_factor = (1 - ((iteration - warmup_duration) / decay_steps))^pow``.
    """

    def __init__(
        self,
        optimizer,
        init_lr,
        scaled_lr,
        warmup_duration,
        decay_steps,
        min_lr=0,
        power=2,
    ):
        """
        Args:
            optimizer (:obj:`torch.optim.Optimizer`): Optimizer to use.
            init_lr (float): Initial LR at start of training.
            scaled_lr (float): Scaled LR to reach after warm-up.
            warmup_duration (int): Warm-up steps.
            decay_steps (int): Decay steps for power decay.
            min_lr (float): Floor LR reached at the end of the decay.
            power (float): Power to use.
        """
        # Warm-up factor computation is delegated to LRLinearWarmUp.
        self.warmup = LRLinearWarmUp(
            optimizer=optimizer,
            init_lr=init_lr,
            scaled_lr=scaled_lr,
            warmup_duration=warmup_duration,
        )
        self.pow = power
        self.decay_steps = decay_steps
        self.min_lr = min_lr
        super(PolyDecayLRLinearWarmup, self).__init__(optimizer, self.f)
        # Set after super().__init__ (which already invoked `f` once); once the
        # decay window is exhausted, `f` keeps returning the last stored factor.
        self.factor = 1

    def f(self, iteration):
        if iteration <= self.warmup.duration:
            # Warm-up phase.
            self.factor = self.warmup.f(iteration)
        else:
            elapsed = iteration - self.warmup.duration
            if elapsed <= self.decay_steps:
                # Polynomial interpolation from 1 down to min_lr/scaled_lr.
                progress = elapsed / self.decay_steps
                floor = math.pow(self.min_lr / self.warmup.scaled_lr, 1 / self.pow)
                self.factor = math.pow((1 - progress) + floor * progress, self.pow)
        return self.factor