Why is the bottleneck structure slower and more memory-hungry than standard convolution when training a network?
【Posted】2019-06-27 11:03:01
【Question】I am using a V-Net to train a model. I want to train it faster and with less memory, so I replaced the standard 3x3 convolutions with a combination of [1x1, 3x3, 1x1] convolutions: the first 1x1 conv reduces the channels to 1/N to cut the memory cost. The code is below.
The first two classes below are the bottleneck structure and the standard convolution block. When I replace the standard convolution with the bottleneck structure, the model size and FLOPs go down, but the actual GPU memory cost and the training time both go up.
For example, I get:
Using standard convolution..........
Total parameters : 10,052,609 float, model size : 39,268.00390625M
191.78 GFLOPs
end : 10.62517523765564s
Max memory allocated : 3818.25341796875M
Using bottleneck...........
Total parameters : 1,145,061 float, model size : 4,472.89453125M
16.05 GFLOPs
end : 16.890745162963867s
Max memory allocated : 4408.35107421875 M
However, at inference time the bottleneck structure does speed the network up to some extent.
Does anyone know why this happens, and how to speed the network up in both training and inference?
Code:
import torch
import torch.nn as nn
import torch.nn.functional as F


def groupNorm(channel, num_groups=16):
    return nn.GroupNorm(num_groups=num_groups, num_channels=channel)


Norm = nn.BatchNorm3d
BottleNeck_Ratio = 4


class BottleNeck(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, stride=1, padding=1, N=BottleNeck_Ratio):
        super(BottleNeck, self).__init__()
        self.conv_1 = nn.Conv3d(in_channels=in_channels, out_channels=out_channels // N, kernel_size=1, stride=1)
        self.conv_2 = nn.Conv3d(in_channels=out_channels // N, out_channels=out_channels // N, kernel_size=kernel_size,
                                stride=stride, padding=padding)
        self.conv_3 = nn.Conv3d(in_channels=out_channels // N, out_channels=out_channels, kernel_size=1, stride=1)
        self.norm = Norm(out_channels)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout3d(drop)

    def forward(self, input):
        x = self.conv_1(input)
        x = self.conv_2(x)
        x = self.conv_3(x)
        return self.drop(self.relu(self.norm(x)))


class CBR(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, stride=1, padding=1):
        super(CBR, self).__init__()
        self.conv = nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size,
                              stride=stride, padding=padding)
        self.norm = Norm(out_channels)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout3d(drop)

    def forward(self, input):
        return self.drop(self.relu(self.norm(self.conv(input))))
ConvBnReluDrop = BottleNeck


class ResidualDown(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, conv_nums, down=True):
        super(ResidualDown, self).__init__()
        if down:
            self.down = ConvBnReluDrop(in_channels, out_channels, kernel_size=2, stride=2, padding=0, drop=drop)
        else:
            self.down = ConvBnReluDrop(in_channels, out_channels, kernel_size=3, stride=1, padding=1, drop=drop)
        self.convs = nn.ModuleList()
        for i in range(conv_nums):
            self.convs.append(ConvBnReluDrop(out_channels, out_channels, kernel_size, drop))
        self.has_down = down

    def forward(self, x):
        # downsample
        res = self.down(x)
        # convolution
        out = res
        for conv in self.convs:
            out = conv(out)
        # residual
        return out + res


class ResidualUp(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, drop, conv_nums, up=True):
        super(ResidualUp, self).__init__()
        if up:
            self.deconv = nn.ConvTranspose3d(in_channels, out_channels, kernel_size=2, stride=2)
        else:
            self.deconv = ConvBnReluDrop(in_channels, out_channels, kernel_size=3, stride=1, padding=1, drop=drop)
        self.convs = nn.ModuleList()
        self.convs.append(ConvBnReluDrop(2 * out_channels, out_channels, kernel_size, drop))
        for i in range(conv_nums - 1):
            self.convs.append(ConvBnReluDrop(out_channels, out_channels, kernel_size, drop))

    def forward(self, big, small):
        x = self.deconv(small)
        # interpolate to prevent size mismatch
        x = F.interpolate(x, big.size()[-3:], mode='trilinear', align_corners=False)
        # save x as residual, [out_ch]
        res = x
        # skip connection, concat and conv to small's channel
        # [2*out_ch] => [out_ch]
        x = torch.cat([big, x], 1)
        for conv in self.convs:
            x = conv(x)
        return x + res
class VBNet(nn.Module):
    def __init__(self, in_ch=1, nclass=1, drop=0.01, level=5, bn='batch', bottleneck=False):
        super(VBNet, self).__init__()
        # levels
        self.level = level
        # Normalization layer
        global Norm
        if bn == 'batch':
            Norm = nn.BatchNorm3d
        elif bn == 'group':
            Norm = groupNorm
        # elif bn == 'syncbn':
        #     Norm = SyncBN3d
        else:
            raise Exception("Error for bn")
        global ConvBnReluDrop
        if bottleneck:
            ConvBnReluDrop = BottleNeck
        else:
            ConvBnReluDrop = CBR
        # down 2
        self.downs = nn.ModuleList()
        self.downs.append(ResidualDown(in_ch, 16, 3, drop, 1, False))
        self.downs.append(ResidualDown(16, 32, 3, drop, 2))
        # down layers
        channels = 32
        for i in range(level - 2):
            self.downs.append(ResidualDown(channels, channels * 2, 3, drop, 3))
            channels *= 2
        # up layers
        self.ups = nn.ModuleList()
        for i in range(level - 3):
            self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 3))
            channels = channels // 2
        # up 2
        self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 2))
        channels = channels // 2
        self.ups.append(ResidualUp(channels, channels // 2, 3, drop, 1, False))
        channels = channels // 2
        # classifier
        self.classifier = nn.Conv3d(channels, nclass, kernel_size=1)

    def forward(self, x):  # 4,472.89453125M
        outs = []
        for layer in self.downs:
            x = layer(x)
            outs.append(x)
        small = outs[-1]
        for i in range(len(self.ups)):
            layer = self.ups[i]
            big = outs[self.level - i - 2]
            small = layer(big, small)
        out = self.classifier(small)
        return out
def get_net_size(net):
    params = list(net.parameters())
    k = 0
    for i in params:
        l = 1
        for j in i.size():
            l *= j
        k = k + l
    s = ("Total parameters : {:,} float, model size : {:,}M".format(k, k * 4 / 1024))
    return s
if __name__ == '__main__':
    # count_ops is taken from : https://github.com/1adrianb/pytorch-estimate-flops/blob/master/pthflops/ops.py
    from count_ops import count_ops  # assuming ops.py was saved locally as count_ops.py
    import os
    import time

    os.environ['CUDA_VISIBLE_DEVICES'] = '1'

    # 4003728896
    print("Using standard convolution..........")
    a = torch.randn(6, 1, 32, 128, 128)
    net = VBNet(bn='batch', bottleneck=False)
    print(get_net_size(net))
    print(count_ops(net, a))
    net = net.cuda()
    start = time.time()
    for i in range(10):
        a = torch.randn(6, 1, 32, 128, 128).cuda()
        b = net(a)
        b.sum().backward()
    print('end : {}s'.format(time.time() - start))
    print("Max memory allocated : {}M".format(torch.cuda.max_memory_allocated(0) / (1024.**2)))

    # 4543840768 4622491136
    print("\nUsing bottleneck...........")
    # torch.cuda.reset_max_memory_allocated(0)
    a = torch.randn(6, 1, 32, 128, 128)
    net = VBNet(bn='batch', bottleneck=True)
    print(get_net_size(net))
    print(count_ops(net, a))
    net = net.cuda()
    start = time.time()
    for i in range(10):
        a = torch.randn(6, 1, 32, 128, 128).cuda()
        b = net(a)
        b.sum().backward()
    print('end : {}s'.format(time.time() - start))
    print("Max memory allocated : {}M".format(torch.cuda.max_memory_allocated(0) / (1024.**2)))
【Comments】:
Regarding GPU memory: you should not use nvidia-smi. Use torch.cuda.max_memory_allocated() instead.
Thanks for the suggestion. I measured the memory usage in code instead, but the result still shows that training the model with the bottleneck structure needs more memory.
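For reference, a minimal sketch of measuring each model's peak memory independently by resetting the per-device counter between the two runs, assuming a single GPU; train_steps below is a hypothetical stand-in for the ten-iteration loop in the posted script:

import torch

def measure_peak_memory(run_fn, device=0):
    # reset the per-device peak counter so this run's peak is measured on its own
    torch.cuda.reset_max_memory_allocated(device)
    run_fn()
    torch.cuda.synchronize(device)
    return torch.cuda.max_memory_allocated(device) / (1024. ** 2)  # MiB

# hypothetical usage, mirroring the two runs in the question:
# print(measure_peak_memory(lambda: train_steps(standard_net)))
# print(measure_peak_memory(lambda: train_steps(bottleneck_net)))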
【Answer 1】:
I compared three kinds of convolution blocks: standard convolution, the bottleneck structure, and separable convolution, and got these performance results:
For standard convolution:
Total parameters : 13920 float, model size : 54.3750M
2.75 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.0517 s
Total iteration : 250
mean forward time : 0.0003 s
mean backward time : 0.0007 s
Max memory allocated : 120.1846 M
-------------------Test analyze----------------
total test time : 7.6900 s
Total iteration : 250
mean data time : 0.0305 s
mean forward time : 0.0003 s
Max memory allocated : 72.1826 M
For the bottleneck structure:
Total parameters : 7872 float, model size : 30.7500M
1.56 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.7080 s
Total iteration : 250
mean forward time : 0.0009 s
mean backward time : 0.0016 s
Max memory allocated : 168.0767 M
-------------------Test analyze----------------
total test time : 8.8901 s
Total iteration : 250
mean data time : 0.0348 s
mean forward time : 0.0008 s
Max memory allocated : 72.0728 M
For separable convolution:
Total parameters : 1088 float, model size : 4.2500M
0.23 GFLOPs, for input size : (6, 16, 32, 32, 32)
-------------------Train analyze----------------
total train time : 8.3567 s
Total iteration : 250
mean forward time : 0.0009 s
mean backward time : 0.0014 s
Max memory allocated : 144.2021 M
-------------------Test analyze----------------
total test time : 7.9258 s
Total iteration : 250
mean data time : 0.0309 s
mean forward time : 0.0008 s
Max memory allocated : 72.1992 M
We can see that standard convolution is roughly twice as fast as the bottleneck structure and the separable convolution, and it does not consume more memory than the other two either.
My guess at the reason: during the forward and backward passes of training, the bottleneck and separable structures contain more convolution modules, so they have to keep more intermediate inputs around for backpropagation and also launch more convolution operations than a single standard convolution. That is why standard convolution beats these two structures in both memory consumption and speed during training.
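To make that concrete, here is a rough back-of-the-envelope sketch (my own estimate, not an exact accounting of what autograd/cuDNN actually caches) of the activation tensors kept for the backward pass by one standard conv block versus one bottleneck block, at the test shape (6, 16, 32, 32, 32) with 32 output channels and N=2:

# rough activation-count estimate for the backward pass (assumption-laden sketch)
batch, d, h, w = 6, 32, 32, 32
in_ch, out_ch, N = 16, 32, 2
voxels = batch * d * h * w

# standard block (Conv3d -> Norm -> ReLU): roughly one full-channel conv output
# plus the norm output are saved for backward
standard_acts = voxels * out_ch * 2

# bottleneck block (three Conv3d -> Norm -> ReLU stages): two reduced-channel
# stages plus one full-channel stage, each saving its conv output and norm output
bottleneck_acts = 2 * (voxels * (out_ch // N) * 2) + voxels * out_ch * 2

print("standard  : {:.1f} MiB of fp32 activations".format(standard_acts * 4 / 2**20))
print("bottleneck: {:.1f} MiB of fp32 activations".format(bottleneck_acts * 4 / 2**20))
# roughly 48 MiB vs 96 MiB: fewer FLOPs, but about twice the saved activations

The exact numbers depend on what the backend really keeps, but the trend matches the measured peaks above (about 120 MiB vs 168 MiB, which also include weights, gradients and the input batch).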
Another reason the separable convolution is slower may be that the cuDNN library does not directly support depthwise separable convolutions.
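As a side note (my own suggestion, not something I benchmarked here), since the input shapes are fixed in these tests, letting cuDNN auto-tune its convolution algorithms once per shape can sometimes narrow the gap for the 1x1 and grouped convolutions:

import torch
# with fixed input sizes, allow cuDNN to benchmark and cache the fastest kernels
torch.backends.cudnn.benchmark = True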
However, compared with standard convolution, both structures do shrink the model size considerably, which is very useful for mobile devices.
The code is below.
The three different convolution blocks:
import torch
import torch.nn as nn
import functools

# assuming the analysis helper below is saved as analyze_network_performance.py
from analyze_network_performance import analyze_network_performance

Norm = nn.BatchNorm3d


class CBRSeq(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
        super(CBRSeq, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding),
            Norm(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, input):
        return self.seq(input)


class BottleNeckSeq(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
        super(BottleNeckSeq, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv3d(in_channels=in_channels, out_channels=out_channels//N, kernel_size=1, stride=1),
            Norm(out_channels//N),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels=out_channels//N, out_channels=out_channels//N, kernel_size=kernel_size, stride=stride, padding=padding),
            Norm(out_channels//N),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels=out_channels//N, out_channels=out_channels, kernel_size=1),
            Norm(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, input):
        return self.seq(input)


class GroupSeq(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1, padding=1, N=2):
        super(GroupSeq, self).__init__()
        self.seq = nn.Sequential(
            nn.Conv3d(in_channels=in_channels, out_channels=in_channels, groups=in_channels,
                      kernel_size=kernel_size, stride=stride, padding=padding),
            Norm(in_channels),
            nn.ReLU(inplace=True),
            nn.Conv3d(in_channels=in_channels, out_channels=out_channels, kernel_size=1),
            Norm(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, input):
        return self.seq(input)


def test_bottleneck():
    data_gen = functools.partial(torch.randn, 6, 16, 32, 32, 32)
    a = BottleNeckSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    b = CBRSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    c = GroupSeq(in_channels=16, out_channels=32, kernel_size=3, stride=1, padding=1)
    print('BottleNeck Structure ....')
    analyze_network_performance(a, data_gen, train_time=250, test_time=250)
    print('\nStandard Convolution ....')
    analyze_network_performance(b, data_gen, train_time=250, test_time=250)
    print('\nSeparable Convolution ...')
    analyze_network_performance(c, data_gen, train_time=250, test_time=250)


if __name__ == '__main__':
    test_bottleneck()
The analyze_network_performance code:
import time
# count_ops is taken from : https://github.com/1adrianb/pytorch-estimate-flops/blob/master/pthflops/ops.py
from ops import count_ops
import torch
import numpy as np


def get_net_size(net):
    params = list(net.parameters())
    k = 0
    for i in params:
        l = 1
        for j in i.size():
            l *= j
        k = k + l
    s = ("Total parameters : {} float, model size : {:.4f}M".format(k, k * 4 / 1024))
    return s


class Timer(object):
    def __init__(self, verbose=False):
        self.start_time = time.time()
        self.verbose = verbose
        self.duration = 0

    def restart(self):
        self.duration = self.start_time = time.time()
        return self.duration

    def stop(self):
        return time.time() - self.start_time

    def get_last_duration(self):
        return self.duration

    def __enter__(self):
        self.restart()

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.duration = self.stop()
        if self.verbose:
            print('{:^.4f} s'.format(self.stop()))


def to_cuda(data, device):
    if device < 0:
        return data
    else:
        return data.cuda(device)


def network_train_analyze(net, data_generate_func, cuda=0, train_time=10, forward_verbose=False):
    t1 = Timer(verbose=True)
    t2 = Timer(forward_verbose)
    t3 = Timer(verbose=False)
    if cuda >= 0:
        torch.cuda.reset_max_memory_allocated(cuda)
    forward_times = []
    backward_times = []
    with t1:
        for i in range(train_time):
            a = to_cuda(data_generate_func(), cuda)
            with t3:
                b = net(a)
            if forward_verbose:
                print('forward : ', end='')
            forward_times.append(t3.get_last_duration())
            with t2:
                b.sum().backward()
            if forward_verbose:
                print('backward : ', end='')
            backward_times.append(t2.get_last_duration())
        print('total train time : ', end='')
    print("Total iteration : {}".format(train_time))
    print('mean forward time : {:^.4f} s'.format(np.mean(forward_times[1:])))
    print('mean backward time : {:^.4f} s'.format(np.mean(backward_times[1:])))
    if cuda >= 0:
        print("Max memory allocated : {:^.4f} M".format(torch.cuda.max_memory_allocated(cuda) / (1024.**2)))


def network_test_analyze(net, data_generate_func, cuda=0, test_time=50, forward_verbose=False):
    t1 = Timer(verbose=True)
    t2 = Timer(verbose=forward_verbose)
    t3 = Timer(verbose=False)
    if cuda >= 0:
        torch.cuda.reset_max_memory_allocated(cuda)
    forward_times = []
    data_times = []
    with t1:
        with torch.no_grad():
            for i in range(test_time):
                with t3:
                    a = to_cuda(data_generate_func(), cuda)
                data_times.append(t3.get_last_duration())
                with t2:
                    net(a)
                if forward_verbose:
                    print('forward : ', end='')
                forward_times.append(t2.get_last_duration())
        print('total test time : ', end='')
    print("Total iteration : {}".format(test_time))
    print('mean data time : {:^.4f} s'.format(np.mean(data_times[1:])))
    print('mean forward time : {:^.4f} s'.format(np.mean(forward_times[1:])))
    if cuda >= 0:
        print("Max memory allocated : {:^.4f} M".format(torch.cuda.max_memory_allocated(cuda) / (1024.**2)))


def analyze_network_performance(net, data_generate_func, cuda=0, train_time=10, test_time=20, forward_verbose=False):
    print('============ Analyzing network performance ==============')
    print(get_net_size(net))
    net = to_cuda(net, cuda)
    a = data_generate_func()
    a = to_cuda(a, cuda)
    print(count_ops(net, a))
    print('-------------------Train analyze----------------')
    network_train_analyze(net, data_generate_func, cuda, train_time, forward_verbose)
    print('-------------------Test analyze----------------')
    network_test_analyze(net, data_generate_func, cuda, test_time, forward_verbose)
【Discussion】: