Tensorboard 训练分类算法的tensorboard可视化示例
Posted 洪流之源
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Tensorboard 训练分类算法的tensorboard可视化示例相关的知识,希望对你有一定的参考价值。
该示例实现了tensorboard对训练数据图像、模型计算图、学习率、损失值、准确率、梯度、模型权值参数的可视化:
# 监控loss, accuracy, weights, gradients
import numpy as np
import math
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets
from torchvision import models
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm
if __name__ == '__main__':
    # ---------------- hyper-parameters ----------------
    NUM_CLASSES = 100
    MAX_EPOCH = 100
    BATCH_SIZE = 64
    INIT_LR = 0.01
    LRF = 0.1              # cosine floor: final LR = INIT_LR * LRF
    LOG_INTERVAL = 10      # refresh the progress-bar text every N steps
    VAL_INTERVAL = 1       # run validation every N epochs
    SAVE_CHECKPOINTS_DIR = 'checkpoints'

    # TensorBoard writer (view with: tensorboard --logdir logs)
    summary_writer = SummaryWriter(log_dir='logs')

    if not os.path.exists(SAVE_CHECKPOINTS_DIR):
        os.makedirs(SAVE_CHECKPOINTS_DIR)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # CIFAR images are upsampled to 224x224 so resnet18 can be used unchanged;
    # normalization uses the standard ImageNet statistics.
    transform_train = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])
    transform_test = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

    train_dataset = datasets.cifar.CIFAR100(root='cifar100', train=True, transform=transform_train, download=True)
    test_dataset = datasets.cifar.CIFAR100(root='cifar100', train=False, transform=transform_test, download=True)

    # batched loaders; shuffling the validation set is unnecessary, metrics are order-independent
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)
    val_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)

    # log one grid of training images so the input pipeline can be eyeballed
    data_batch, label_batch = next(iter(train_loader))
    images_grid = torchvision.utils.make_grid(tensor=data_batch, nrow=8, normalize=True, scale_each=True)
    summary_writer.add_image(tag='image', img_tensor=images_grid, global_step=0)

    # model
    net = models.resnet18(num_classes=NUM_CLASSES)
    net = net.to(device)

    # log the computation graph
    dummy_input = torch.randn(1, 3, 224, 224).to(device)
    summary_writer.add_graph(model=net, input_to_model=dummy_input, verbose=False)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(net.parameters(), lr=INIT_LR, momentum=0.9)

    # cosine LR schedule decaying from INIT_LR to INIT_LR * LRF over MAX_EPOCH epochs
    lf = lambda x: ((1 + math.cos(x * math.pi / MAX_EPOCH)) / 2) * (1 - LRF) + LRF
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    train_curve = list()
    valid_curve = list()
    iter_count = 0
    # FIX: best_acc must persist across epochs; it was previously reset inside the
    # epoch loop, which made the "best checkpoint" logic save almost every epoch.
    best_acc = 0.0

    for epoch in range(1, MAX_EPOCH + 1):
        loss_mean = 0.
        correct = 0.
        total = 0.

        net.train()
        train_bar = tqdm(train_loader)
        for step, data in enumerate(train_bar):
            iter_count += 1
            # forward
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = net(inputs)
            # backward
            optimizer.zero_grad()
            loss = criterion(outputs, labels)
            loss.backward()
            # update weights
            optimizer.step()

            # running accuracy over the epoch so far
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).squeeze().sum().cpu().numpy()
            accuracy = correct / total

            loss_mean += loss.item()
            train_curve.append(loss.item())
            if (step + 1) % LOG_INTERVAL == 0:
                loss_mean = loss_mean / LOG_INTERVAL
                # FIX: restored the format-placeholder braces that were stripped
                train_bar.desc = "Training:Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                    epoch, MAX_EPOCH, step + 1, len(train_loader), loss_mean, accuracy)
                loss_mean = 0.

            # FIX: add_scalars takes a dict of sub-tag -> value (braces were stripped)
            summary_writer.add_scalars("Train Loss", {"Train": loss.item()}, iter_count)
            summary_writer.add_scalars("Train Accuracy", {"Train": accuracy}, iter_count)

        # per-epoch histograms of gradients and weights
        for name, param in net.named_parameters():
            summary_writer.add_histogram(name + '_grad', param.grad, epoch)
            summary_writer.add_histogram(name + '_data', param, epoch)

        scheduler.step()  # advance the LR schedule once per epoch
        lr = scheduler.get_last_lr()[0]
        # FIX: tag typo ("Learing" -> "Learning")
        summary_writer.add_scalars("Learning Rate", {"LR": lr}, epoch)

        # ---------------- validation ----------------
        # FIX: epoch already starts at 1, so test epoch itself (not epoch + 1)
        if epoch % VAL_INTERVAL == 0:
            correct_val = 0.
            total_val = 0.
            loss_val = 0.
            net.eval()
            with torch.no_grad():
                for j, data in enumerate(val_loader):
                    inputs, labels = data
                    inputs = inputs.to(device)
                    labels = labels.to(device)
                    outputs = net(inputs)
                    loss = criterion(outputs, labels)

                    _, predicted = torch.max(outputs.data, 1)
                    total_val += labels.size(0)
                    correct_val += (predicted == labels).squeeze().sum().cpu().numpy()
                    loss_val += loss.item()

            val_accuracy = correct_val / total_val
            # FIX: report the mean validation loss of THIS epoch (the original
            # printed the raw sum and logged the mean over all epochs' history)
            loss_val_mean = loss_val / len(val_loader)
            valid_curve.append(loss_val_mean)
            print("Val:\t Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                epoch, MAX_EPOCH, j + 1, len(val_loader), loss_val_mean, val_accuracy))

            summary_writer.add_scalars("Val Loss", {"Val": loss_val_mean}, iter_count)
            summary_writer.add_scalars("Val Accuracy", {"Val": val_accuracy}, iter_count)

            # keep only the best-performing checkpoint
            if val_accuracy > best_acc:
                best_acc = val_accuracy
                save_path = os.path.join(SAVE_CHECKPOINTS_DIR, 'best.pth')
                torch.save(net.state_dict(), save_path)

    # flush pending events to disk
    summary_writer.close()
运行tensorboard,并指定日志目录:
tensorboard --logdir logs
训练图片可视化:
模型计算图可视化:
学习率、准确率、损失等可视化:
梯度、权重参数可视化:
以上是关于Tensorboard 训练分类算法的tensorboard可视化示例的主要内容,如果未能解决你的问题,请参考以下文章
Pytorch Note53 TensorBoard 可视化
使用 keras 在 tensorboard 中显示分类图像
炼丹小技巧1:通过TensorBoard查看loss走向判断模型是否收敛