Tensorboard 训练分类算法的tensorboard可视化示例

Posted 洪流之源

tags:

篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了Tensorboard 训练分类算法的tensorboard可视化示例相关的知识,希望对你有一定的参考价值。

该示例实现了tensorboard对训练数据图像、模型计算图、学习率、损失值、准确率、梯度、模型权值参数的可视化:

# 监控loss, accuracy, weights, gradients
import numpy as np
import math
import os
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
from torchvision import datasets
from torchvision import models
import torchvision.transforms as transforms
from torch.utils.tensorboard import SummaryWriter
from tqdm import tqdm

if __name__ == '__main__':

    # ---- Hyper-parameters ----
    NUM_CLASSES = 100
    MAX_EPOCH = 100
    BATCH_SIZE = 64
    INIT_LR = 0.01
    LRF = 0.1           # final LR factor: LR decays to INIT_LR * LRF at the last epoch
    LOG_INTERVAL = 10   # refresh the tqdm training line every N steps
    VAL_INTERVAL = 1    # run validation every N epochs
    SAVE_CHECKPOINTS_DIR = 'checkpoints'

    # TensorBoard writer; view with: tensorboard --logdir logs
    summary_writer = SummaryWriter(log_dir='logs')

    os.makedirs(SAVE_CHECKPOINTS_DIR, exist_ok=True)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    # ImageNet-style preprocessing: ResNet18 expects 224x224 normalized inputs.
    # Train and test pipelines are identical here, so share the Normalize op.
    normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
    transform_train = transforms.Compose([transforms.Resize(256),
                                          transforms.CenterCrop(224),
                                          transforms.ToTensor(),
                                          normalize])

    transform_test = transforms.Compose([transforms.Resize(256),
                                         transforms.CenterCrop(224),
                                         transforms.ToTensor(),
                                         normalize])

    train_dataset = datasets.cifar.CIFAR100(root='cifar100', train=True, transform=transform_train, download=True)
    test_dataset = datasets.cifar.CIFAR100(root='cifar100', train=False, transform=transform_test, download=True)

    # Batch loaders
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset,
                                               batch_size=BATCH_SIZE,
                                               shuffle=True)

    # FIX: no reason to shuffle the validation set; metrics are order-independent
    val_loader = torch.utils.data.DataLoader(dataset=test_dataset,
                                             batch_size=BATCH_SIZE,
                                             shuffle=False)

    # Show one batch of training images in TensorBoard
    data_batch, label_batch = next(iter(train_loader))
    images_grid = torchvision.utils.make_grid(tensor=data_batch, nrow=8, normalize=True, scale_each=True)
    summary_writer.add_image(tag='image', img_tensor=images_grid, global_step=0)

    # Model
    net = models.resnet18(num_classes=NUM_CLASSES)
    net = net.to(device)

    # Visualize the model's computation graph in TensorBoard
    dummy_input = torch.randn(1, 3, 224, 224).to(device)
    summary_writer.add_graph(model=net, input_to_model=dummy_input, verbose=False)

    # Loss function
    criterion = nn.CrossEntropyLoss()

    # Optimizer
    optimizer = optim.SGD(net.parameters(), lr=INIT_LR, momentum=0.9)

    # LR schedule: cosine decay from INIT_LR down to INIT_LR * LRF over MAX_EPOCH epochs
    # scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    lf = lambda x: ((1 + math.cos(x * math.pi / MAX_EPOCH)) / 2) * (1 - LRF) + LRF  # cosine
    scheduler = torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda=lf)

    # Training
    train_curve = list()
    valid_curve = list()

    iter_count = 0
    # BUG FIX: best_acc must persist across epochs. The original reset it to 0.0
    # inside the epoch loop, so *every* epoch's checkpoint overwrote 'best.pth'.
    best_acc = 0.0

    for epoch in range(1, MAX_EPOCH + 1):

        loss_mean = 0.
        correct = 0.
        total = 0.

        net.train()
        train_bar = tqdm(train_loader)
        for step, data in enumerate(train_bar):
            iter_count += 1

            # forward
            inputs, labels = data
            inputs = inputs.to(device)
            labels = labels.to(device)

            outputs = net(inputs)

            # backward
            optimizer.zero_grad()
            loss = criterion(outputs, labels)
            loss.backward()

            # update weights
            optimizer.step()

            # Running accuracy over the epoch so far
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).squeeze().sum().cpu().numpy()
            accuracy = correct / total

            # Progress reporting
            loss_mean += loss.item()
            train_curve.append(loss.item())
            if (step + 1) % LOG_INTERVAL == 0:
                loss_mean = loss_mean / LOG_INTERVAL
                # BUG FIX: the format string had lost all of its '{...}' placeholders,
                # so .format() printed the literal template with no values.
                train_bar.desc = "Training:Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                                 epoch, MAX_EPOCH, step + 1, len(train_loader), loss_mean, accuracy)
                loss_mean = 0.

            # BUG FIX: add_scalars takes a dict of {tag: value}; the braces were missing,
            # which is a syntax error.
            summary_writer.add_scalars("Train Loss", {"Train": loss.item()}, iter_count)
            summary_writer.add_scalars("Train Accuracy", {"Train": accuracy}, iter_count)

        # Per-epoch histograms of gradients and weight values
        for name, param in net.named_parameters():
            # Guard: parameters with requires_grad=False have grad=None and
            # would crash add_histogram.
            if param.grad is not None:
                summary_writer.add_histogram(name + '_grad', param.grad, epoch)
            summary_writer.add_histogram(name + '_data', param, epoch)

        scheduler.step()  # advance the LR schedule
        lr = scheduler.get_last_lr()[0]
        # BUG FIX: dict argument restored; also fixed the 'Learing' -> 'Learning' tag typo.
        summary_writer.add_scalars("Learning Rate", {"LR": lr}, epoch)

        # Validation.
        # BUG FIX: epoch starts at 1, so the condition is epoch % VAL_INTERVAL,
        # not (epoch + 1) % VAL_INTERVAL (the original skewed which epochs validate).
        if epoch % VAL_INTERVAL == 0:

            correct_val = 0.
            total_val = 0.
            loss_val = 0.
            net.eval()

            with torch.no_grad():
                for j, data in enumerate(val_loader):
                    inputs, labels = data
                    inputs = inputs.to(device)
                    labels = labels.to(device)

                    outputs = net(inputs)
                    loss = criterion(outputs, labels)

                    _, predicted = torch.max(outputs.data, 1)
                    total_val += labels.size(0)
                    correct_val += (predicted == labels).squeeze().sum().cpu().numpy()
                    loss_val += loss.item()

                val_accuracy = correct_val / total_val
                # BUG FIX: report the mean validation loss, not the running sum;
                # the original printed the accumulated sum and logged
                # np.mean(valid_curve) (a mean over ALL past epochs) to TensorBoard.
                loss_val_mean = loss_val / len(val_loader)
                valid_curve.append(loss_val_mean)
                # BUG FIX: placeholders restored; '\\t' was a double-escaped tab.
                print("Val:\t Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] Loss: {:.4f} Acc:{:.2%}".format(
                      epoch, MAX_EPOCH, j + 1, len(val_loader), loss_val_mean, val_accuracy))

                # TensorBoard: current-epoch validation loss and accuracy
                summary_writer.add_scalars("Val Loss", {"Val": loss_val_mean}, iter_count)
                summary_writer.add_scalars("Val Accuracy", {"Val": val_accuracy}, iter_count)

                # Keep only the best-accuracy checkpoint
                if val_accuracy > best_acc:
                    best_acc = val_accuracy
                    save_path = os.path.join(SAVE_CHECKPOINTS_DIR, 'best.pth')
                    torch.save(net.state_dict(), save_path)

    # FIX: flush and close the event file so the last writes are not lost
    summary_writer.close()

 运行tensorboard,并指定日志目录:

tensorboard --logdir logs

训练图片可视化: 

 模型计算图可视化:

 学习率、准确率、损失等可视化:

 

 梯度、权重参数可视化:

以上是关于Tensorboard 训练分类算法的tensorboard可视化示例的主要内容,如果未能解决你的问题,请参考以下文章

Pytorch Note53 TensorBoard 可视化

使用 keras 在 tensorboard 中显示分类图像

如何使用 Tensorboard 检查训练模型的准确性?

炼丹小技巧1:通过TensorBoard查看loss走向判断模型是否收敛

炼丹小技巧1:通过TensorBoard查看loss走向判断模型是否收敛

基于pytorch实现简单的分类模型训练