Warmup and Cosine Annealing Learning Rate
Posted by 洪流之源
The main idea of warmup with cosine annealing is that the learning rate first increases linearly from a very small value up to a preset learning rate, and then decays following a cosine curve. Why do this? The main reasons are:
1. At the start of training the weights are randomly initialized; the model's "understanding" of the data is essentially zero, i.e. it has no prior knowledge. During the first epoch every batch is new to the model, and the model adjusts its parameters rapidly to fit the incoming data. If a large learning rate is used at this point, there is a good chance the model will "overfit" to these early batches and need many extra epochs to be "pulled back";
2. After the model has trained for a while (e.g. 10 epochs or 10,000 steps), it has some prior knowledge of the data, so a larger learning rate is less likely to steer it off course and can be used to accelerate convergence;
3. After training with a large learning rate for some time, the model's parameter distribution becomes relatively stable. At this stage it should no longer be learning drastically new patterns from the data; keeping a large learning rate would disrupt this stability, whereas a small learning rate makes it easier to settle into a (local) optimum.
The learning rate therefore first rises linearly from a very small value up to a peak learning rate and then decays following the cosine function, as in the sketch below:
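To make the shape of the schedule concrete, here is a minimal sketch of it as a plain function of the epoch index (the peak learning rate, warmup length, and total number of epochs are illustrative values, not anything prescribed by PyTorch):

import math

def warmup_cosine_lr(epoch, peak_lr=0.1, warmup_epochs=5, max_epochs=100, min_lr=0.0):
    # linear warmup: ramp up to peak_lr over the first warmup_epochs epochs
    if epoch < warmup_epochs:
        return peak_lr * (epoch + 1) / warmup_epochs
    # cosine decay: fall from peak_lr down to min_lr over the remaining epochs
    progress = (epoch - warmup_epochs) / (max_epochs - warmup_epochs)
    return min_lr + (peak_lr - min_lr) * (1 + math.cos(math.pi * progress)) / 2

print([round(warmup_cosine_lr(e), 4) for e in (0, 2, 4, 5, 50, 99)])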
Pseudocode for implementing warmup with cosine annealing in PyTorch looks like this:
import torch
import torch.optim as optim
from warmup_scheduler import GradualWarmupScheduler

MAX_EPOCH = 100
INIT_LR = 0.01
WARMUP_LR_TIMES = 10
WARMUP_EPOCH = 5

# net is the model being trained
optimizer = optim.SGD(net.parameters(), lr=INIT_LR, momentum=0.9)

# cosine decay used after the warmup phase
cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer,
                                                              T_max=MAX_EPOCH,
                                                              eta_min=0,
                                                              last_epoch=-1)
# linear warmup for the first WARMUP_EPOCH epochs, then hand over to cosine_scheduler
scheduler = GradualWarmupScheduler(optimizer,
                                   multiplier=WARMUP_LR_TIMES,
                                   total_epoch=WARMUP_EPOCH,
                                   after_scheduler=cosine_scheduler)

for epoch in range(1, MAX_EPOCH + 1):
    ... ...
    optimizer.step()
    ... ...
    scheduler.step(epoch)
    print(epoch, optimizer.param_groups[0]['lr'])
    ... ...
API:
torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max, eta_min=0, last_epoch=-1, verbose=False)
optimizer: the optimizer whose learning rate is scheduled; the initial learning rate is set when the optimizer is constructed.
T_max: maximum number of iterations (or epochs), i.e. the length of the cosine decay.
eta_min: minimum learning rate, default 0.
last_epoch: index of the last epoch, default -1. When resuming a run that was interrupted after many epochs, set this to the epoch of the loaded checkpoint; -1 means training starts from scratch.
verbose: if True, a message is printed for each learning-rate update.
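For reference, CosineAnnealingLR (ignoring warm restarts) follows the closed form eta_t = eta_min + (eta_max - eta_min) * (1 + cos(pi * T_cur / T_max)) / 2, where eta_max is the optimizer's initial learning rate. A tiny sketch of that formula, with purely illustrative constants:

import math

def cosine_annealing(t_cur, base_lr=0.01, eta_min=0.0, t_max=100):
    # eta_t = eta_min + (base_lr - eta_min) * (1 + cos(pi * t_cur / T_max)) / 2
    return eta_min + (base_lr - eta_min) * (1 + math.cos(math.pi * t_cur / t_max)) / 2

for t in (0, 25, 50, 75, 100):
    print(t, round(cosine_annealing(t), 5))  # 0.01 at t=0, 0.005 at t=50, 0.0 at t=100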
GradualWarmupScheduler(optimizer, multiplier, total_epoch, after_scheduler)
optimizer: the optimizer whose learning rate is scheduled.
multiplier: with multiplier=1.0 the learning rate ramps from 0 up to base_lr; with multiplier greater than 1.0 it ramps from base_lr up to base_lr * multiplier. multiplier must not be smaller than 1.0. So what is base_lr?
It is simply the lr passed to the optimizer; for example, with optimizer = optim.SGD(net.parameters(), lr=learning_rate, momentum=0.9, weight_decay=weight_decay), base_lr is learning_rate.
total_epoch: the number of epochs after which the target learning rate is reached, i.e. how long the warmup lasts.
after_scheduler: the scheduler to switch to once total_epoch epochs have passed.
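To illustrate the ramp described above, the warmup phase corresponds roughly to the linear interpolation sketched below. This is based on my reading of the ildoonet implementation; treat the exact boundary handling as an assumption rather than documented behavior:

def warmup_lr(epoch, base_lr=0.01, multiplier=10, total_epoch=5):
    # multiplier == 1.0: ramp from 0 up to base_lr
    # multiplier  > 1.0: ramp from base_lr up to base_lr * multiplier
    if multiplier == 1.0:
        return base_lr * epoch / total_epoch
    return base_lr * ((multiplier - 1.0) * epoch / total_epoch + 1.0)

print([round(warmup_lr(e), 4) for e in range(6)])  # 0.01 ... 0.1 for multiplier=10, total_epoch=5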
Note: GradualWarmupScheduler is not an official PyTorch implementation; install it with:
pip install warmup_scheduler
Example of warmup with cosine annealing in a classification training pipeline:
# monitor loss, accuracy, weights, and gradients during training
import numpy as np
import math
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets
from torchvision import models
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
from warmup_scheduler import GradualWarmupScheduler
if __name__ == '__main__':
    # hyper-parameters
    NUM_CLASSES = 100
    MAX_EPOCH = 100
    BATCH_SIZE = 256
    INIT_LR = 0.01
    LRF = 0.1
    WARMUP_LR_TIMES = 10
    WARMUP_EPOCH = 5
    LOG_INTERVAL = 10
    VAL_INTERVAL = 1
    SAVE_CHECKPOINTS_DIR = 'checkpoints'

    # build the SummaryWriter
    summary_writer = SummaryWriter(log_dir='logs')

    if not os.path.exists(SAVE_CHECKPOINTS_DIR):
        os.makedirs(SAVE_CHECKPOINTS_DIR)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    transform_train = transforms.Compose([transforms.Resize(256),
                                          transforms.CenterCrop(224),
                                          transforms.ToTensor(),
                                          transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
    transform_test = transforms.Compose([transforms.Resize(256),
                                         transforms.CenterCrop(224),
                                         transforms.ToTensor(),
                                         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

    train_dataset = datasets.cifar.CIFAR100(root='cifar100', train=True, transform=transform_train, download=True)
    test_dataset = datasets.cifar.CIFAR100(root='cifar100', train=False, transform=transform_test, download=True)

    # build batched data loaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=True)

    # visualize a batch of training images in tensorboard
    data_batch, label_batch = next(iter(train_loader))
    images_grid = torchvision.utils.make_grid(tensor=data_batch, nrow=8, normalize=True, scale_each=True)
    summary_writer.add_image(tag='image', img_tensor=images_grid, global_step=0)
    # model
    net = models.resnet18(num_classes=NUM_CLASSES)
    net = net.to(device)

    # visualize the model graph in tensorboard
    dummy_input = torch.randn(1, 3, 224, 224).to(device)
    summary_writer.add_graph(model=net, input_to_model=dummy_input, verbose=False)

    # loss function
    criterion = nn.CrossEntropyLoss()

    # optimizer
    optimizer = optim.SGD(net.parameters(), lr=INIT_LR, momentum=0.9)

    # learning-rate schedule: linear warmup followed by cosine annealing
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    # lf = lambda x: ((1 + math.cos(x * math.pi / MAX_EPOCH)) / 2) * (1 - LRF) + LRF  # cosine
    # scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)
    cosine_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer=optimizer, T_max=MAX_EPOCH, eta_min=0, last_epoch=-1)
    scheduler = GradualWarmupScheduler(optimizer, multiplier=WARMUP_LR_TIMES, total_epoch=WARMUP_EPOCH, after_scheduler=cosine_scheduler)
    # training
    train_curve = list()
    valid_curve = list()
    iter_count = 0
    best_acc = 0.0  # kept outside the epoch loop so the best checkpoint is tracked across epochs
    for epoch in range(1, MAX_EPOCH + 1):
        loss_mean = 0.
        correct = 0.
        total = 0.

        net.train()
        train_bar = tqdm(train_loader)
        for step, data in enumerate(train_bar):
            iter_count += 1

            # forward
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = net(inputs)

            # backward
            optimizer.zero_grad()
            loss = criterion(outputs, labels)
            loss.backward()

            # update weights
            optimizer.step()

            # classification statistics
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).squeeze().sum().cpu().numpy()
            accuracy = correct / total

            # log training progress
            loss_mean += loss.item()
            train_curve.append(loss.item())
            if (step + 1) % LOG_INTERVAL == 0:
                loss_mean = loss_mean / LOG_INTERVAL
                train_bar.desc = "Training:Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                    epoch, MAX_EPOCH, step + 1, len(train_loader), loss_mean, accuracy)
                loss_mean = 0.

            # log training loss and accuracy to tensorboard
            summary_writer.add_scalars("Train Loss", {"Train": loss.item()}, iter_count)
            summary_writer.add_scalars("Train Accuracy", {"Train": accuracy}, iter_count)

        # log the gradients and weights of every parameter once per epoch
        for name, param in net.named_parameters():
            summary_writer.add_histogram(name + '_grad', param.grad, epoch)
            summary_writer.add_histogram(name + '_data', param, epoch)
        # update the learning rate once per epoch
        # scheduler.step()
        scheduler.step(epoch)
        # lr = scheduler.get_last_lr()[0]
        lr = optimizer.param_groups[0]['lr']

        # log the learning rate for this epoch
        summary_writer.add_scalars("Learning Rate", {"LR": lr}, epoch)
        # validation
        if (epoch + 1) % VAL_INTERVAL == 0:
            correct_val = 0.
            total_val = 0.
            loss_val = 0.
            net.eval()
            with torch.no_grad():
                for j, data in enumerate(val_loader):
                    inputs, labels = data
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)

                    _, predicted = torch.max(outputs.data, 1)
                    total_val += labels.size(0)
                    correct_val += (predicted == labels).squeeze().sum().cpu().numpy()
                    val_accuracy = correct_val / total_val
                    loss_val += loss.item()

                valid_curve.append(loss.item())
                print("Val:\t Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                    epoch, MAX_EPOCH, j + 1, len(val_loader), loss_val, val_accuracy))

                # log validation loss and accuracy to tensorboard
                summary_writer.add_scalars("Val Loss", {"Val": np.mean(valid_curve)}, iter_count)
                summary_writer.add_scalars("Val Accuracy", {"Val": val_accuracy}, iter_count)

                # save the best checkpoint so far
                if val_accuracy > best_acc:
                    best_acc = val_accuracy
                    save_path = os.path.join(SAVE_CHECKPOINTS_DIR, 'best.pth')
                    torch.save(net.state_dict(), save_path)
In the code above, GradualWarmupScheduler is configured with multiplier=10 and total_epoch=5, and SGD is initialized with lr=0.01, so over the first 5 epochs the learning rate ramps from 0.01 up to 0.01 x 10 = 0.1 and then starts to decay; the SGD initial lr is therefore the smallest learning rate of the whole run. (With multiplier=1 instead, the learning rate would ramp from 0 up to 0.01 over the first 5 epochs and then decay, making the SGD initial lr the largest learning rate of the run.) The learning rate curve visualized in tensorboard confirms this behavior.
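If you want to check the schedule without running a full training job, one option is to step the schedulers against a dummy optimizer and print (or plot) the resulting learning rates. This is only a sketch mirroring the pseudocode above; the dummy parameter is a stand-in so no real model is needed:

import torch
import torch.optim as optim
from warmup_scheduler import GradualWarmupScheduler

params = [torch.nn.Parameter(torch.zeros(1))]  # dummy parameter, no real model required
optimizer = optim.SGD(params, lr=0.01, momentum=0.9)
cosine = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=100, eta_min=0)
scheduler = GradualWarmupScheduler(optimizer, multiplier=10, total_epoch=5, after_scheduler=cosine)

for epoch in range(1, 101):
    optimizer.step()  # step the optimizer first to avoid the scheduler-ordering warning
    scheduler.step(epoch)
    print(epoch, optimizer.param_groups[0]['lr'])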
References:
https://github.com/ildoonet/pytorch-gradual-warmup-lr
CosineAnnealingLR (cosine annealing learning rate) - DEDSEC_Roger, CSDN blog
Using GradualWarmupScheduler for learning-rate warmup (with code) - 阿毛啊阿阿, CSDN blog