神经网络学习速率设置指南（CLR Callback，LRFinder，SGDR等最新的学习率设置方案）附完整代码解析_综合

周期性学习率（CLR）

在这里插入图片描述
循环学习率（CLR）是一种学习率调整策略：它让学习率在基值（下限 base_lr）与上限（max_lr）之间周期性地往复变化。通常，周期的频率是恒定的，但振幅往往会在每个周期或每次小批量迭代中动态缩放。

from keras.callbacks import *
from keras.models import Sequential, Model
from keras.layers import Dense, Activation, Input
from keras.optimizers import *
import matplotlib.pyplot as plt'''循环学习率是学习率调整的策略，其在周期性质中将学习率从基值增加。通常，周期的频率是恒定的，但是振幅通常在每个周期或每个小批量迭代中动态地缩放。'''
class CyclicLR(Callback):
    """Keras callback implementing a cyclical learning rate (CLR) policy.

    The learning rate cycles between two boundaries with a constant
    frequency, as described in https://arxiv.org/abs/1506.01186.
    The amplitude of the cycle can be scaled per cycle or per iteration.

    Three built-in policies are available:

    "triangular":
        A basic triangular cycle with no amplitude scaling.
    "triangular2":
        A basic triangular cycle that halves the amplitude each cycle.
    "exp_range":
        A cycle that scales the amplitude by ``gamma ** iterations`` at
        every iteration.

    # Example
        ```python
        clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                       step_size=2000., mode='triangular')
        model.fit(X_train, Y_train, callbacks=[clr])
        ```

    Custom scaling functions are also supported:
        ```python
        clr_fn = lambda x: 0.5 * (1 + np.sin(x * np.pi / 2.))
        clr = CyclicLR(base_lr=0.001, max_lr=0.006,
                       step_size=2000., scale_fn=clr_fn,
                       scale_mode='cycle')
        model.fit(X_train, Y_train, callbacks=[clr])
        ```

    # Arguments
        base_lr: initial learning rate; the lower boundary of the cycle.
            It is written to the optimizer when training begins.
        max_lr: upper boundary of the cycle. Functionally it defines the
            cycle amplitude (max_lr - base_lr). The lr at any point is
            base_lr plus some scaling of the amplitude, so max_lr may not
            actually be reached depending on the scaling function.
        step_size: number of training iterations per half cycle. The
            authors suggest setting step_size to 2-8 x the number of
            training iterations in an epoch.
        mode: one of {'triangular', 'triangular2', 'exp_range'}.
            Default 'triangular'. Ignored when scale_fn is not None.
        gamma: constant used by the 'exp_range' scaling function
            (gamma ** iterations).
        scale_fn: custom scaling policy, a single-argument callable with
            0 <= scale_fn(x) <= 1 for all x >= 0. When given, the mode
            parameter is ignored.
        scale_mode: {'cycle', 'iterations'}. Whether scale_fn is evaluated
            on the cycle number or on the number of iterations since the
            last reset. Only used together with scale_fn.
            Default 'cycle'.

    # Raises
        ValueError: if scale_fn is None and mode is not one of the
            built-in policies.
    """

    def __init__(self, base_lr=0.001, max_lr=0.006, step_size=2000., mode='triangular',
                 gamma=1., scale_fn=None, scale_mode='cycle'):
        super(CyclicLR, self).__init__()

        self.base_lr = base_lr
        self.max_lr = max_lr
        self.step_size = step_size
        self.mode = mode
        self.gamma = gamma
        if scale_fn is None:
            if self.mode == 'triangular':
                self.scale_fn = lambda x: 1.
                self.scale_mode = 'cycle'
            elif self.mode == 'triangular2':
                self.scale_fn = lambda x: 1 / (2. ** (x - 1))
                self.scale_mode = 'cycle'
            elif self.mode == 'exp_range':
                self.scale_fn = lambda x: gamma ** (x)
                self.scale_mode = 'iterations'
            else:
                # Fail at construction time instead of leaving scale_fn /
                # scale_mode undefined and crashing in the middle of training.
                raise ValueError(
                    "mode must be one of 'triangular', 'triangular2' or "
                    "'exp_range' when scale_fn is None, got %r" % (mode,))
        else:
            self.scale_fn = scale_fn
            self.scale_mode = scale_mode
        self.clr_iterations = 0.
        self.trn_iterations = 0.
        self.history = {}

        self._reset()

    def _reset(self, new_base_lr=None, new_max_lr=None,
               new_step_size=None):
        """Restart the cycle, optionally changing the boundaries / step size.

        Only `clr_iterations` is zeroed; `trn_iterations` and `history`
        keep accumulating across resets.
        """
        if new_base_lr is not None:
            self.base_lr = new_base_lr
        if new_max_lr is not None:
            self.max_lr = new_max_lr
        if new_step_size is not None:
            self.step_size = new_step_size
        self.clr_iterations = 0.

    def clr(self):
        """Return the learning rate for the current `clr_iterations`."""
        # 1-based index of the current cycle (one cycle = 2 * step_size iterations).
        cycle = np.floor(1 + self.clr_iterations / (2 * self.step_size))
        # Distance from the peak of the current cycle, in units of step_size.
        x = np.abs(self.clr_iterations / self.step_size - 2 * cycle + 1)
        # Triangular wave in [0, 1]: 0 at the cycle edges, 1 at the peak.
        triangle = np.maximum(0, (1 - x))
        amplitude = self.max_lr - self.base_lr
        if self.scale_mode == 'cycle':
            return self.base_lr + amplitude * triangle * self.scale_fn(cycle)
        else:
            return self.base_lr + amplitude * triangle * self.scale_fn(self.clr_iterations)

    def on_train_begin(self, logs=None):
        """Set the optimizer's lr to the start (or current point) of the cycle."""
        logs = logs or {}

        if self.clr_iterations == 0:
            K.set_value(self.model.optimizer.lr, self.base_lr)
        else:
            # Training is being resumed with this callback: continue the cycle.
            K.set_value(self.model.optimizer.lr, self.clr())

    def on_batch_end(self, epoch, logs=None):
        """Record the lr and metrics of the finished batch, then advance the lr."""
        logs = logs or {}
        self.trn_iterations += 1
        self.clr_iterations += 1

        # The lr read here is the one that was in effect for the batch just finished.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.trn_iterations)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())


if __name__ == '__main__':
    # One epoch is one full pass over the training set. For example, with
    # batch_size = 100, an epoch that needs 500 iterations in total.
    # Iterations per epoch = np.ceil(train_data / batch_size).
    inp = Input(shape=(15,))
    x = Dense(10, activation='relu')(inp)
    x = Dense(1, activation='sigmoid')(x)
    model = Model(inp, x)

    X = np.random.rand(2000000, 15)
    Y = np.random.randint(0, 2, size=2000000)

    clr_triangular = CyclicLR(mode='triangular')
    # The optimizer's own lr (0.1) is overwritten by the callback when training begins.
    model.compile(optimizer=SGD(0.1), loss='binary_crossentropy', metrics=['accuracy'])
    # `epochs` replaces the `nb_epoch` keyword used by Keras 1.
    model.fit(X, Y, batch_size=2000, epochs=10, callbacks=[clr_triangular], verbose=0)

    plt.figure()
    plt.plot(clr_triangular.history['iterations'], clr_triangular.history['lr'])
    plt.xlabel('Training Iterations')
    plt.ylabel('Learning Rate')
    plt.title("CLR - 'triangular' Policy")
    plt.show()

    # Alternative demo: 'triangular2' policy with a manual reset.
    # clr_triangular = CyclicLR(mode='triangular2')
    # model.compile(optimizer=SGD(), loss='binary_crossentropy', metrics=['accuracy'])
    # model.fit(X, Y, batch_size=2000, epochs=20, callbacks=[clr_triangular], verbose=0)
    # clr_triangular._reset()
    # model.fit(X, Y, batch_size=2000, epochs=10, callbacks=[clr_triangular], verbose=0)
    # plt.xlabel('Training Iterations')
    # plt.ylabel('Learning Rate')
    # plt.title("'triangular2' Policy Reset at 20000 Iterations")
    # plt.plot(clr_triangular.history['iterations'], clr_triangular.history['lr'])

这个类的参数包括：

base_lr：初始学习率，这是周期中的下限。这会覆盖优化器lr。默认值为0.001。
max_lr：循环中的上边界。在功能上，它定义了循环幅度（max_lr - base_lr）。任一时刻的lr等于base_lr加上该幅度乘以某个缩放系数；因此，取决于缩放函数，max_lr实际上可能无法达到。默认0.006。
step_size：每半个周期的训练迭代次数。作者建议设定step_size = (2-8) x (training iterations in epoch)。默认2000。
mode：其中一个{‘triangular’, ‘triangular2’, ‘exp_range’}。值对应于下面详述的策略。如果scale_fn不是None，则忽略该参数。默认’triangular’。
gamma：‘exp_range’缩放功能常数，gamma^(cycle iterations)。默认1。
scale_fn：自定义缩放策略，由单参数的lambda函数定义，对所有x >= 0须满足0 <= scale_fn(x) <= 1。使用此参数时，将忽略mode参数。默认None。
scale_mode：{‘cycle’, ‘iterations’}。定义是否scale_fn根据循环次数或循环迭代进行评估（自循环开始后的训练迭代）。默认是’cycle’。

详情可参照：https://github.com/bckenstler/CLR

寻找最优学习速率范围

在这里插入图片描述

编写一个 Keras 回调函数：让学习速率在给定范围内线性变化，并记录每个学习速率所对应的损失值。

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from keras import backend as K
from keras.callbacks import Callback,ModelCheckpoint
import matplotlib.pyplot as plt
import numpy as np# Keras 回调函数，就是追踪与一个在确定范围内变化的线性的学习速率相搭配的损失函数。
class LRFinder(Callback):
    """Keras callback that sweeps the learning rate linearly and records the loss.

    The learning rate starts at `min_lr` and is increased linearly after
    every batch towards `max_lr`, which it reaches after
    `steps_per_epoch * epochs` batches. The learning rate and all batch
    logs are stored in `self.history`.

    # Arguments
        min_lr: lower bound of the learning rate range for the experiment.
        max_lr: upper bound of the learning rate range for the experiment.
        steps_per_epoch: number of mini-batches in the dataset.
        epochs: number of epochs to run the experiment for.

    # Raises
        TypeError: if `steps_per_epoch` or `epochs` is not provided.
    """

    def __init__(self, min_lr=1e-7, max_lr=1e-4, steps_per_epoch=None, epochs=None):
        super().__init__()

        # Both values are required to compute the sweep length; fail with a
        # clear message instead of an opaque "NoneType * NoneType" error.
        if steps_per_epoch is None or epochs is None:
            raise TypeError("LRFinder requires both steps_per_epoch and epochs")

        self.min_lr = min_lr
        self.max_lr = max_lr
        self.total_iterations = steps_per_epoch * epochs
        self.iteration = 0
        self.history = {}

    def clr(self):
        """Calculate the learning rate for the current iteration."""
        # Fraction of the sweep completed so far.
        x = self.iteration / self.total_iterations
        return self.min_lr + (self.max_lr - self.min_lr) * x

    def on_train_begin(self, logs=None):
        """Initialize the learning rate to the minimum value at the start of training."""
        logs = logs or {}
        K.set_value(self.model.optimizer.lr, self.min_lr)

    def on_batch_end(self, epoch, logs=None):
        """Record previous batch statistics and update the learning rate."""
        logs = logs or {}
        self.iteration += 1

        # The lr read here is the one that was in effect for the batch just finished.
        self.history.setdefault('lr', []).append(K.get_value(self.model.optimizer.lr))
        self.history.setdefault('iterations', []).append(self.iteration)

        for k, v in logs.items():
            self.history.setdefault(k, []).append(v)

        K.set_value(self.model.optimizer.lr, self.clr())

    def plot_lr(self):
        """Helper function to quickly inspect the learning rate schedule."""
        plt.plot(self.history['iterations'], self.history['lr'])
        plt.yscale('log')
        plt.xlabel('Iteration')
        plt.ylabel('Learning rate')
        plt.savefig("../images/images/plot_lr.png")
        plt.show()

    def plot_loss(self):
        """Helper function to quickly observe the learning rate experiment results."""
        plt.plot(self.history['lr'], self.history['loss'])
        plt.xscale('log')
        plt.xlabel('Learning rate')
        plt.ylabel('Loss')
        plt.savefig("../images/images/plot_loss.png")
        plt.show()


if __name__ == '__main__':
    # min_lr: The lower bound of the learning rate range for the experiment.
    # max_lr: The upper bound of the learning rate range for the experiment.
    # steps_per_epoch: Number of mini-batches in the dataset.
    # epochs: Number of epochs to run experiment. Usually between 2 and 4 epochs.
    a = 200  # presumably the number of training samples -- TODO confirm
    batch_size = 40
    epochs = 3
    # True division before the ceiling so a final partial batch is counted;
    # `a // batch_size` would already have rounded down.
    lr_finder = LRFinder(min_lr=1e-7, max_lr=1e-4,
                         steps_per_epoch=np.ceil(a / batch_size),
                         epochs=epochs)

设置一个学习速率表（步衰减）

在这里插入图片描述

学习速率退火的最流行方式是「步衰减」（Step Decay），其中学习率经过一定数量的训练 epochs 后下降了一定的百分比。

import numpy as np
from keras.callbacks import LearningRateScheduler
def step_decay_schedule(initial_lr=1e-3, decay_factor=0.75, step_size=10):
    """Build a LearningRateScheduler callback that applies step decay.

    The scheduled learning rate is
    ``initial_lr * decay_factor ** floor(epoch / step_size)``, i.e. the
    rate is multiplied by `decay_factor` once every `step_size` epochs.

    # Arguments
        initial_lr: learning rate returned for the first `step_size` epochs.
        decay_factor: multiplier applied at every decay step.
        step_size: number of epochs between two decay steps.

    # Returns
        A `LearningRateScheduler` wrapping the step-decay schedule.
    """

    def schedule(epoch):
        # Number of decay steps completed so far.
        completed_steps = np.floor(epoch / step_size)
        return initial_lr * (decay_factor ** completed_steps)

    return LearningRateScheduler(schedule)


if __name__ == '__main__':
    lr_sched = step_decay_schedule(
        initial_lr=1e-4,
        decay_factor=0.75,
        step_size=2,
    )
    print(lr_sched)

带有重启的随机梯度下降

在这里插入图片描述

带有热重启的随机梯度下降（SGDR）与周期性方法很相似：它将激进的退火策略与周期性的「重启」相结合，每次重启时学习率都恢复到最初的初始值。

class SGDRScheduler(Callback):
    """Schedule learning rates with restarts.

    A simple restart technique for stochastic gradient descent.
    The learning rate decays after each batch and periodically resets to its
    initial value. Optionally, the learning rate is additionally reduced by a
    fixed factor at a predefined set of epochs.

    # Arguments
        epochsize: Number of samples per epoch during training.
        batchsize: Number of samples per batch during training.
        epochs_to_restart: Initial number of epochs before restarts.
        mult_factor: Increase of epochs_to_restart after each restart.
        lr_fac: Decrease of learning rate at epochs given in
            lr_reduction_epochs.
        lr_reduction_epochs: Fixed list of epochs at which to reduce
            learning rate.

    # References
        - [SGDR: Stochastic Gradient Descent with Restarts](http://arxiv.org/abs/1608.03983)
    """

    def __init__(self,
                 epochsize,
                 batchsize,
                 epochs_to_restart=2,
                 mult_factor=2,
                 lr_fac=0.1,
                 lr_reduction_epochs=(60, 120, 160)):
        super(SGDRScheduler, self).__init__()
        # Own epoch counter, advanced in on_epoch_begin (starts at -1 so the
        # first epoch is 0).
        self.epoch = -1
        self.batch_since_restart = 0
        self.next_restart = epochs_to_restart
        self.epochsize = epochsize
        self.batchsize = batchsize
        self.epochs_to_restart = epochs_to_restart
        self.mult_factor = mult_factor
        self.batches_per_epoch = self.epochsize / self.batchsize
        self.lr_fac = lr_fac
        self.lr_reduction_epochs = lr_reduction_epochs
        self.lr_log = []

    def on_train_begin(self, logs=None):
        """Remember the optimizer's current lr as the value restarts return to."""
        self.lr = K.get_value(self.model.optimizer.lr)

    def on_epoch_begin(self, epoch, logs=None):
        """Advance the internal epoch counter."""
        self.epoch += 1

    def on_batch_end(self, batch, logs=None):
        """Apply cosine annealing based on progress towards the next restart."""
        fraction_to_restart = self.batch_since_restart / \
            (self.batches_per_epoch * self.epochs_to_restart)
        # Half-cosine from self.lr (fraction 0) down towards 0 (fraction 1).
        lr = 0.5 * self.lr * (1 + np.cos(fraction_to_restart * np.pi))
        K.set_value(self.model.optimizer.lr, lr)

        self.batch_since_restart += 1
        self.lr_log.append(lr)

    def on_epoch_end(self, epoch, logs=None):
        """Trigger a restart and/or a fixed lr reduction when their epoch is reached."""
        if self.epoch + 1 == self.next_restart:
            # Restart: reset the annealing progress and lengthen the next period.
            self.batch_since_restart = 0
            self.epochs_to_restart *= self.mult_factor
            self.next_restart += self.epochs_to_restart

        if (self.epoch + 1) in self.lr_reduction_epochs:
            self.lr *= self.lr_fac