Why is the bottleneck structure slower and more memory-hungry than standard convolution when training a network?
【Posted】2019-06-27 11:03:01
【Question】I am using a V-Net to train a model. I want to train it faster and with less memory, so I replaced the standard 3x3 convolutions with a combination of [1x1, 3x3, 1x1] convolutions: the first 1x1 conv reduces the channels to 1/N to cut the memory cost. The code is below.
The first two classes below are the bottleneck structure and the standard convolution block. When I replace the standard convolution with the bottleneck structure, the model size and FLOPs go down, but the actual GPU memory cost and the training time both go up.
For example, I get:
Using standard convolution..........
Total parameters : 10,052,609 float, model size : 39,268.00390625M
191.78 GFLOPs
end : 10.62517523765564s
Max memory allocated : 3818.25341796875M
Using bottleneck...........
Total parameters : 1,145,061 float, model size : 4,472.89453125M
16.05 GFLOPs
end : 16.890745162963867s
Max memory allocated : 4408.35107421875 M
However, at inference time the bottleneck structure does speed the network up to some extent.
Does anyone know why this happens, and how to speed the network up in both training and inference?
Code:
import torch
import torch.nn as nn
import torch.nn.functional as F


def groupNorm(channel, num_groups=16):
    return nn.GroupNorm(num_groups=num_groups, num_channels=channel)


Norm = nn.BatchNorm3d
BottleNeck_Ratio = 4


class BottleNeck(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, stride=1, padding=1, N=BottleNeck_Ratio):
        super(BottleNeck, self).__init__()
        self.conv_1 = nn.Conv3d(in_channels=in_channels, out_channels=out_channels // N, kernel_size=1, stride=1)
        self.conv_2 = nn.Conv3d(in_channels=out_channels // N, out_channels=out_channels // N, kernel_size=kernel_size,
                                stride=stride, padding=padding)
        self.conv_3 = nn.Conv3d(in_channels=out_channels // N, out_channels=out_channels, kernel_size=1, stride=1)
        self.norm = Norm(out_channels)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout3d(drop)

    def forward(self, input):
        x = self.conv_1(input)
        x = self.conv_2(x)
        x = self.conv_3(x)
        return self.drop(self.relu(self.norm(x)))


class CBR(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, stride=1, padding=1):
        super(CBR, self).__init__()
        self.conv = nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
                              stride=stride, padding=padding)
        self.norm = Norm(out_channels)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout3d(drop)

    def forward(self, input):
        return self.drop(self.relu(self.norm(self.conv(input))))
ConvBnReluDrop = BottleNeck


class ResidualDown(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, conv_nums, down=True):
        super(ResidualDown, self).__init__()
        if down:
            self.down = ConvBnReluDrop(in_channels, out_channels, kernel_size=2, stride=2, padding=0, drop=drop)
        else:
            self.down = ConvBnReluDrop(in_channels, out_channels, kernel_size=3, stride=1, padding=1, drop=drop)
        self.convs = nn.ModuleList()
        for i in range(conv_nums):
            self.convs.append(ConvBnReluDrop(out_channels, out_channels, kernel_size, drop))
        self.has_down = down

    def forward(self, x):
        # downsample
        res = self.down(x)
        # convolution
        out = res
        for conv in self.convs:
            out = conv(out)
        # residual
        return out + res


class ResidualUp(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, conv_nums, up=True):
        super(ResidualUp, self).__init__()
        if up:
            self.deconv = nn.ConvTranspose3d(in_channels, out_channels, kernel_size=2, stride=2)
        else:
            self.deconv = ConvBnReluDrop(in_channels, out_channels, kernel_size=3, stride=1, padding=1, drop=drop)
        self.convs = nn.ModuleList()
        self.convs.append(ConvBnReluDrop(2 * out_channels, out_channels, kernel_size, drop))
        for i in range(conv_nums - 1):
            self.convs.append(ConvBnReluDrop(out_channels, out_channels, kernel_size, drop))

    def forward(self, big, small):
        x = self.deconv(small)
        # interpolate to prevent size mismatch
        x = F.interpolate(x, big.size()[-3:], mode='trilinear', align_corners=False)
        # save x as residual, [out_ch]
        res = x
        # skip connection, concat and conv to small's channel
        # [2*out_ch] => [out_ch]
        x = torch.cat([big, x], 1)
        for conv in self.convs:
            x = conv(x)
        return x + res
class VBNet(nn.Module):
    def __init__(self, in_ch=1, nclass=1, drop=0.01, level=5, bn='batch', bottleneck=False):
        super(VBNet, self).__init__()
        # levels
        self.level = level
        # Normalization layer
        global Norm
        if bn == 'batch':
            Norm = nn.BatchNorm3d
        elif bn == 'group':
            Norm = groupNorm
        # elif bn == 'syncbn':
        #     Norm = SyncBN3d
        else:
            raise Exception("Error for bn")
        global ConvBnReluDrop
        if bottleneck:
            ConvBnReluDrop = BottleNeck
        else:
            ConvBnReluDrop = CBR
        # down 2
        self.downs = nn.ModuleList()
        self.downs.append(ResidualDown(in_ch, 16, 3, drop, 1, False))
        self.downs.append(ResidualDown(16, 32, 3, drop, 2))
        # down layers
        channels = 32
        for i in range(level - 2):
            self.downs.append(ResidualDown(channels, channels * 2, 3, drop, 3))
            channels *= 2
        # up layers
        self.ups = nn.ModuleList()
        for i in range(level - 3):
            self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 3))
            channels = channels // 2
        # up 2
        self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 2))
        channels = channels // 2
        self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 1, False))
        channels = channels // 2
        # classifier
        self.classifier = nn.Conv3d(channels, nclass, kernel_size=1)

    def forward(self, x):  # 4,472.89453125M
        outs = []
        for layer in self.downs:
            x = layer(x)
            outs.append(x)
        small = outs[-1]
        for i in range(len(self.ups)):
            layer = self.ups[i]
            big = outs[self.level - i - 2]
            small = layer(big, small)
        out = self.classifier(small)
        return out
def get_net_size(net):
    params = list(net.parameters())
    k = 0
    for i in params:
        l = 1
        for j in i.size():
            l *= j
        k = k + l
    s = ("Total parameters : {:,} float, model size : {:,}M".format(k, k * 4 / 1024))
    return s
if __name__ == '__main__':
    # count_ops is taken from : https://github.com/1adrianb/pytorch-estimate-flops/blob/master/pthflops/ops.py
    from count_ops import count_ops  # assuming ops.py was saved locally as count_ops.py
    import os
    import time

    os.environ['CUDA_VISIBLE_DEVICES'] = '1'

    # 4003728896
    print("Using standard convolution..........")
    a = torch.randn(6, 1, 32, 128, 128)
    net = VBNet(bn='batch', bottleneck=False)
    print(get_net_size(net))
    print(count_ops(net, a))
    net = net.cuda()
    start = time.time()
    for i in range(10):
        a = torch.randn(6, 1, 32, 128, 128).cuda()
        b = net(a)
        b.sum().backward()
    print('end : {}s'.format(time.time() - start))
    print("Max memory allocated : {}M".format(torch.cuda.max_memory_allocated(0) / (1024.**2)))

    # 4543840768 4622491136
    print("\nUsing bottleneck...........")
    # torch.cuda.reset_max_memory_allocated(0)
    a = torch.randn(6, 1, 32, 128, 128)
    net = VBNet(bn='batch', bottleneck=True)
    print(get_net_size(net))
    print(count_ops(net, a))
    net = net.cuda()
    start = time.time()
    for i in range(10):
        a = torch.randn(6, 1, 32, 128, 128).cuda()
        b = net(a)
        b.sum().backward()
    print('end : {}s'.format(time.time() - start))
    print("Max memory allocated : {}M".format(torch.cuda.max_memory_allocated(0) / (1024.**2)))
【Comments】:
Regarding GPU memory: you should not use nvidia-smi. Use torch.cuda.max_memory_allocated() instead.
Thanks for the suggestion. I measured the memory usage in code instead, but the result still shows that training the model with the bottleneck structure needs more memory.
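For reference, a minimal sketch of measuring each model's peak memory independently by resetting the per-device counter between the two runs, assuming a single GPU; train_steps below is a hypothetical stand-in for the ten-iteration loop in the posted script:

import torch

def measure_peak_memory(run_fn, device=0):
    # reset the per-device peak counter so this run's peak is measured on its own
    torch.cuda.reset_max_memory_allocated(device)
    run_fn()
    torch.cuda.synchronize(device)
    return torch.cuda.max_memory_allocated(device) / (1024. ** 2)  # MiB

# hypothetical usage, mirroring the two runs in the question:
# print(measure_peak_memory(lambda: train_steps(standard_net)))
# print(measure_peak_memory(lambda: train_steps(bottleneck_net)))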
【Answer 1】:
I compared three kinds of convolution blocks: standard convolution, the bottleneck structure, and separable convolution, and got these performance results:
For standard convolution:
Total parameters : 13920 float, model size : 54.3750M
2.75 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.0517 s
Total iteration : 250
mean forward time : 0.0003 s
mean backward time : 0.0007 s
Max memory allocated : 120.1846 M
-------------------Test analyze----------------
total test time : 7.6900 s
Total iteration : 250
mean data time : 0.0305 s
mean forward time : 0.0003 s
Max memory allocated : 72.1826 M
For the bottleneck structure:
Total parameters : 7872 float, model size : 30.7500M
1.56 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.7080 s
Total iteration : 250
mean forward time : 0.0009 s
mean backward time : 0.0016 s
Max memory allocated : 168.0767 M
-------------------Test analyze----------------
total test time : 8.8901 s
Total iteration : 250
mean data time : 0.0348 s
mean forward time : 0.0008 s
Max memory allocated : 72.0728 M
For separable convolution:
Total parameters : 1088 float, model size : 4.2500M
0.23 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.3567 s
Total iteration : 250
mean forward time : 0.0009 s
mean backward time : 0.0014 s
Max memory allocated : 144.2021 M
-------------------Test analyze----------------
total test time : 7.9258 s
Total iteration : 250
mean data time : 0.0309 s
mean forward time : 0.0008 s
Max memory allocated : 72.1992 M
We can see that standard convolution is roughly twice as fast as the bottleneck structure and the separable convolution, and it does not consume more memory than the other two either.
My guess at the reason: during the forward and backward passes of training, the bottleneck and separable structures contain more convolution modules, so they have to keep more intermediate inputs around for backpropagation and also launch more convolution operations than a single standard convolution. That is why standard convolution beats these two structures in both memory consumption and speed during training.
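To make that concrete, here is a rough back-of-the-envelope sketch (my own estimate, not an exact accounting of what autograd/cuDNN actually caches) of the activation tensors kept for the backward pass by one standard conv block versus one bottleneck block, at the test shape (6, 16, 32, 32, 32) with 32 output channels and N=2:

# rough activation-count estimate for the backward pass (assumption-laden sketch)
batch, d, h, w = 6, 32, 32, 32
in_ch, out_ch, N = 16, 32, 2
voxels = batch * d * h * w

# standard block (Conv3d -> Norm -> ReLU): roughly one full-channel conv output
# plus the norm output are saved for backward
standard_acts = voxels * out_ch * 2

# bottleneck block (three Conv3d -> Norm -> ReLU stages): two reduced-channel
# stages plus one full-channel stage, each saving its conv output and norm output
bottleneck_acts = 2 * (voxels * (out_ch // N) * 2) + voxels * out_ch * 2

print("standard  : {:.1f} MiB of fp32 activations".format(standard_acts * 4 / 2**20))
print("bottleneck: {:.1f} MiB of fp32 activations".format(bottleneck_acts * 4 / 2**20))
# roughly 48 MiB vs 96 MiB: fewer FLOPs, but about twice the saved activations

The exact numbers depend on what the backend really keeps, but the trend matches the measured peaks above (about 120 MiB vs 168 MiB, which also include weights, gradients and the input batch).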
Another reason the separable convolution is slower may be that the cuDNN library does not directly support depthwise separable convolutions.
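As a side note (my own suggestion, not something I benchmarked here), since the input shapes are fixed in these tests, letting cuDNN auto-tune its convolution algorithms once per shape can sometimes narrow the gap for the 1x1 and grouped convolutions:

import torch
# with fixed input sizes, allow cuDNN to benchmark and cache the fastest kernels
torch.backends.cudnn.benchmark = True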
However, compared with standard convolution, both structures do shrink the model size considerably, which is very useful for mobile devices.
The code is below.
The three different convolution blocks:
import torch
import torch.nn as nn
import functools

# assuming the analysis helper below is saved as analyze_network_performance.py
from analyze_network_performance import analyze_network_performance

Norm = nn.BatchNorm3d


class CBRSeq(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
        super(CBRSeq, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding),
            Norm(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, input):
        return self.seq(input)


class BottleNeckSeq(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
        super(BottleNeckSeq, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv3d(in_channels=in_channels, out_channels=out_channels//N, kernel_size=1, stride=1),
            Norm(out_channels//N),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels=out_channels//N, out_channels=out_channels//N, kernel_size=kernel_size, stride=stride, padding=padding),
            Norm(out_channels//N),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels=out_channels//N, out_channels=out_channels, kernel_size=1),
            Norm(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, input):
        return self.seq(input)


class GroupSeq(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
        super(GroupSeq, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv3d(in_channels=in_channels, out_channels=in_channels, groups=in_channels,
                      kernel_size=kernel_size, stride=stride, padding=padding),
            Norm(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=1),
            Norm(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, input):
        return self.seq(input)


def test_bottleneck():
    data_gen = functools.partial(torch.randn, 6, 16, 32, 32, 32)
    a = BottleNeckSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    b = CBRSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    c = GroupSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    print('BottleNeck Structure ....')
    analyze_network_performance(a, data_gen, train_time=250, test_time=250)
    print('\nStandard Convolution ....')
    analyze_network_performance(b, data_gen, train_time=250, test_time=250)
    print('\nSeparable Convolution ...')
    analyze_network_performance(c, data_gen, train_time=250, test_time=250)


if __name__ == '__main__':
    test_bottleneck()
The analyze_network_performance code:
import time
# count_ops is taken from : https://github.com/1adrianb/pytorch-estimate-flops/blob/master/pthflops/ops.py
from ops import count_ops
import torch
import numpy as np


def get_net_size(net):
    params = list(net.parameters())
    k = 0
    for i in params:
        l = 1
        for j in i.size():
            l *= j
        k = k + l
    s = ("Total parameters : {} float, model size : {:.4f}M".format(k, k * 4 / 1024))
    return s


class Timer(object):
    def __init__(self, verbose=False):
        self.start_time = time.time()
        self.verbose = verbose
        self.duration = 0

    def restart(self):
        self.duration = self.start_time = time.time()
        return self.duration

    def stop(self):
        return time.time() - self.start_time

    def get_last_duration(self):
        return self.duration

    def __enter__(self):
        self.restart()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.duration = self.stop()
        if self.verbose:
            print('{:^.4f} s'.format(self.stop()))


def to_cuda(data, device):
    if device < 0:
        return data
    else:
        return data.cuda(device)


def network_train_analyze(net, data_generate_func, cuda=0, train_time=10, forward_verbose=False):
    t1 = Timer(verbose=True)
    t2 = Timer(forward_verbose)
    t3 = Timer(verbose=False)
    if cuda >= 0:
        torch.cuda.reset_max_memory_allocated(cuda)
    forward_times = []
    backward_times = []
    with t1:
        for i in range(train_time):
            a = to_cuda(data_generate_func(), cuda)
            with t3:
                b = net(a)
            if forward_verbose:
                print('forward : ', end='')
            forward_times.append(t3.get_last_duration())
            with t2:
                b.sum().backward()
            if forward_verbose:
                print('backward : ', end='')
            backward_times.append(t2.get_last_duration())
        print('total train time : ', end='')
    print("Total iteration : {}".format(train_time))
    print('mean forward time : {:^.4f} s'.format(np.mean(forward_times[1:])))
    print('mean backward time : {:^.4f} s'.format(np.mean(backward_times[1:])))
    if cuda >= 0:
        print("Max memory allocated : {:^.4f} M".format(torch.cuda.max_memory_allocated(cuda) / (1024.**2)))


def network_test_analyze(net, data_generate_func, cuda=0, test_time=50, forward_verbose=False):
    t1 = Timer(verbose=True)
    t2 = Timer(verbose=forward_verbose)
    t3 = Timer(verbose=False)
    if cuda >= 0:
        torch.cuda.reset_max_memory_allocated(cuda)
    forward_times = []
    data_times = []
    with t1:
        with torch.no_grad():
            for i in range(test_time):
                with t3:
                    a = to_cuda(data_generate_func(), cuda)
                data_times.append(t3.get_last_duration())
                with t2:
                    net(a)
                if forward_verbose:
                    print('forward : ', end='')
                forward_times.append(t2.get_last_duration())
        print('total test time : ', end='')
    print("Total iteration : {}".format(test_time))
    print('mean data time : {:^.4f} s'.format(np.mean(data_times[1:])))
    print('mean forward time : {:^.4f} s'.format(np.mean(forward_times[1:])))
    if cuda >= 0:
        print("Max memory allocated : {:^.4f} M".format(torch.cuda.max_memory_allocated(cuda) / (1024.**2)))


def analyze_network_performance(net, data_generate_func, cuda=0, train_time=10, test_time=20, forward_verbose=False):
    print('============ Analyzing network performance ==============')
    print(get_net_size(net))
    net = to_cuda(net, cuda)
    a = data_generate_func()
    a = to_cuda(a, cuda)
    print(count_ops(net, a))
    print('-------------------Train analyze----------------')
    network_train_analyze(net, data_generate_func, cuda, train_time, forward_verbose)
    print('-------------------Test analyze----------------')
    network_test_analyze(net, data_generate_func, cuda, test_time, forward_verbose)
【Discussion】: