TensorFlow: processing one class at a time


I am trying to train on a dataset of 10,000+ images with TensorFlow on GPU (a GTX 1060 Max-Q with 6 GB). Because the images in my dataset are large (512 x 424 pixels), I get a MemoryError:

Traceback (most recent call last):
  File "train.py", line 33, in <module>
    data = dataset.read_train_sets(train_path, img_size, classes, validation_size=validation_size)
  File "/home/nabeel/tf-realsense-gesture/dataset.py", line 103, in read_train_sets
    images, labels, img_names, cls = shuffle(images, labels, img_names, cls)  
  File "/home/nabeel/anaconda3/envs/tensorflow/lib/python2.7/site-packages/sklearn/utils/__init__.py", line 403, in shuffle
    return resample(*arrays, **options)
  File "/home/nabeel/anaconda3/envs/tensorflow/lib/python2.7/site-packages/sklearn/utils/__init__.py", line 327, in resample
    resampled_arrays = [safe_indexing(a, indices) for a in arrays]
  File "/home/nabeel/anaconda3/envs/tensorflow/lib/python2.7/site-packages/sklearn/utils/__init__.py", line 216, in safe_indexing
    return X.take(indices, axis=0)
MemoryError

The problem with my code is that I am training all seven classes at the same time, which is why I run into the memory error. I want to process a single class at a time.

I tried wrapping the training in a while/for loop, but at the end of every iteration the .meta file is overwritten, so it only ever works on one class. Is there any way to train multiple classes at once, or one by one?
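
For reference, the overwrite happens because saver.save is called with the same path prefix on every pass, so each save replaces the previous .meta file. A minimal sketch of one possible workaround, assuming a hypothetical train_one_class helper that stands in for the per-class training body:

import os

checkpoint_dir = '/home/nabeel/tf-realsense-gesture'
classes = ['up', 'down', 'left', 'right', 'forward', 'backward', 'none']

for cls in classes:
    train_one_class(cls)  # hypothetical helper wrapping the per-class training loop
    # a distinct prefix per class keeps later iterations from
    # overwriting the earlier .meta/.index/.data checkpoint files
    saver.save(session, os.path.join(checkpoint_dir, 'model_' + cls))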

train.py

import tensorflow as tf
import dataset  # the question's local data-loading module

batch_size = 1

# 7 classess for recognitions
#classes = ['up']
classes = ['up','down','left','right','forward','backward','none']
num_classes = len(classes)

# 20% of the data will automatically be used for validation
validation_size = 0.2
img_size = 200
num_channels = 3
train_path='training_data'

# load all the training and validation images and labels into memory
data = dataset.read_train_sets(train_path, img_size, classes, validation_size=validation_size)

print("Complete reading input data. Will Now print a snippet of it")
print("Number of files in Training-set:\t\t".format(len(data.train.labels)))
print("Number of files in Validation-set:\t".format(len(data.valid.labels)))

session = tf.Session()
x = tf.placeholder(tf.float32, shape=[batch_size,img_size,img_size,num_channels], name='x')
# labels
y_true = tf.placeholder(tf.float32, shape=[None, num_classes], name='y_true')
y_true_cls = tf.argmax(y_true, axis=1)

#Network graph params
filter_size_conv1 = 3 
num_filters_conv1 = 32

filter_size_conv2 = 3
num_filters_conv2 = 32

filter_size_conv3 = 3
num_filters_conv3 = 64

filter_size_conv4 = 3
num_filters_conv4 = 128

filter_size_conv5 = 3
num_filters_conv5 = 256

filter_size_conv6 = 3
num_filters_conv6 = 512

filter_size_conv7 = 3
num_filters_conv7= 1024

fc_layer_size = 2048

def create_weights(shape):
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05))

def create_biases(size):
    return tf.Variable(tf.constant(0.05, shape=[size]))

def create_convolutional_layer(input,num_input_channels,conv_filter_size,num_filters):  

    # define the weights that will be trained
    weights = create_weights(shape=[conv_filter_size, conv_filter_size, num_input_channels, num_filters])
    # create biases
    biases = create_biases(num_filters)

    # Create the convolutional layer
    layer = tf.nn.conv2d(input=input,filter=weights,strides=[1, 1, 1, 1],padding='SAME')
    layer += biases

    # max-pooling  
    layer = tf.nn.max_pool(value=layer,
                            ksize=[1, 2, 2, 1],
                            strides=[1, 2, 2, 1],
                            padding='SAME')
    # Relu is the activation function
    layer = tf.nn.relu(layer)
    return layer

def create_flatten_layer(layer):
    layer_shape = layer.get_shape()

    num_features = layer_shape[1:4].num_elements()

    # Flatten the layer so reshape to num_features
    layer = tf.reshape(layer, [-1, num_features])

    return layer

def create_fc_layer(input,          
             num_inputs,    
             num_outputs,
             use_relu=True):

    # define trainable weights and biases.
    weights = create_weights(shape=[num_inputs, num_outputs])
    biases = create_biases(num_outputs)

    # Fully connected layer
    layer = tf.matmul(input, weights) + biases
    if use_relu:
        layer = tf.nn.relu(layer)

    return layer

layer_conv1 = create_convolutional_layer(input=x,num_input_channels=num_channels,conv_filter_size=filter_size_conv1,
        num_filters=num_filters_conv1)
layer_conv2 = create_convolutional_layer(input=layer_conv1,
               num_input_channels=num_filters_conv1,
               conv_filter_size=filter_size_conv2,
               num_filters=num_filters_conv2)

layer_conv3= create_convolutional_layer(input=layer_conv2,
               num_input_channels=num_filters_conv2,
               conv_filter_size=filter_size_conv3,
               num_filters=num_filters_conv3)
layer_conv4= create_convolutional_layer(input=layer_conv3,
               num_input_channels=num_filters_conv3,
               conv_filter_size=filter_size_conv4,
               num_filters=num_filters_conv4)
layer_conv5= create_convolutional_layer(input=layer_conv4,
               num_input_channels=num_filters_conv4,
               conv_filter_size=filter_size_conv5,
               num_filters=num_filters_conv5)
layer_conv6= create_convolutional_layer(input=layer_conv5,
               num_input_channels=num_filters_conv5,
               conv_filter_size=filter_size_conv6,
               num_filters=num_filters_conv6)

layer_conv7= create_convolutional_layer(input=layer_conv6,
               num_input_channels=num_filters_conv6,
               conv_filter_size=filter_size_conv7,
               num_filters=num_filters_conv7)

layer_flat = create_flatten_layer(layer_conv7)

layer_fc1 = create_fc_layer(input=layer_flat,num_inputs=layer_flat.get_shape()[1:4].num_elements(),num_outputs=fc_layer_size,
                     use_relu=True)

layer_fc2 = create_fc_layer(input=layer_fc1, num_inputs=fc_layer_size,num_outputs=num_classes, use_relu=False) 

y_pred = tf.nn.softmax(layer_fc2,name='y_pred')

y_pred_cls = tf.argmax(y_pred, axis=1)
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=layer_fc2, labels=y_true)
cost = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-4).minimize(cost)
correct_prediction = tf.equal(y_pred_cls, y_true_cls)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

session.run(tf.global_variables_initializer()) 

def show_progress(epoch, feed_dict_train, feed_dict_validate, val_loss):
    acc = session.run(accuracy, feed_dict=feed_dict_train)
    val_acc = session.run(accuracy, feed_dict=feed_dict_validate)
    msg = "Training Epoch 0 --- Training Accuracy: 1:>6.1%, Validation Accuracy: 2:>6.1%,  Validation Loss: 3:.3f"
    print(msg.format(epoch + 1, acc, val_acc, val_loss))

total_iterations = 0

saver = tf.train.Saver()
def train(num_iteration):
    global total_iterations

    for i in range(total_iterations,total_iterations + num_iteration):

        x_batch, y_true_batch, _, cls_batch = data.train.next_batch(batch_size)
        x_valid_batch, y_valid_batch, _, valid_cls_batch = data.valid.next_batch(batch_size)

        feed_dict_tr = {x: x_batch, y_true: y_true_batch}
        feed_dict_val = {x: x_valid_batch, y_true: y_valid_batch}

        session.run(optimizer, feed_dict=feed_dict_tr)

        if i % int(data.train.num_examples/batch_size) == 0: 
            val_loss = session.run(cost, feed_dict=feed_dict_val)
            epoch = int(i / int(data.train.num_examples/batch_size))    

            show_progress(epoch, feed_dict_tr, feed_dict_val, val_loss)
            saver.save(session, '/home/nabeel/tf-realsense-gesture/') 

    total_iterations += num_iteration

train(num_iteration=6000)
Answer

Since you are facing an Out of Memory problem with CNNs, you can try the following steps:

  1. Increase the strides of the convolutional layers, i.e., use Sh = 2 and Sw = 2 instead of Sh = 1 and Sw = 1. This shrinks the spatial dimensions of the feature maps and therefore the RAM consumption. The code for this is shown below:

     layer = tf.nn.conv2d(input=input, filter=weights, strides=[1, 2, 2, 1], padding='SAME')

  2. Verify whether you really need 7 convolutional layers. You can try a smaller number of convolutional layers (4, 5, or 6) and check the performance, since every convolutional layer with its set of filters adds to the memory usage (see the first sketch after this list).

  3. Replace tf.float32 with tf.float16, if your model still runs fine with it (see the second sketch after this list).

  4. Use an Inception module instead of a plain convolutional layer (see the third sketch after this list).
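
As a minimal sketch of suggestion 2, using the code from the question: the stack simply stops at layer_conv5, and the flatten layer moves up.

# five convolutional layers instead of seven: layer_conv6 and layer_conv7 are not built
layer_flat = create_flatten_layer(layer_conv5)
layer_fc1 = create_fc_layer(input=layer_flat,
                     num_inputs=layer_flat.get_shape()[1:4].num_elements(),
                     num_outputs=fc_layer_size,
                     use_relu=True)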
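
For suggestion 3, a minimal sketch of switching the graph to half precision, assuming every dtype in the script is changed consistently (placeholders, weights, and biases). Note that float16 training can be numerically unstable, hence the "if your model still runs fine" caveat:

x = tf.placeholder(tf.float16, shape=[batch_size, img_size, img_size, num_channels], name='x')
y_true = tf.placeholder(tf.float16, shape=[None, num_classes], name='y_true')

def create_weights(shape):
    # float16 halves the memory needed per parameter and per activation
    return tf.Variable(tf.truncated_normal(shape, stddev=0.05, dtype=tf.float16))

def create_biases(size):
    return tf.Variable(tf.constant(0.05, shape=[size], dtype=tf.float16))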
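
And for suggestion 4, a minimal sketch of an Inception-style block that reuses create_weights from the question (the 1x1 reduction convolutions of the full GoogLeNet module are omitted for brevity):

def inception_module(input, num_input_channels, num_filters):
    # parallel 1x1, 3x3 and 5x5 convolutions plus a stride-1 max-pool,
    # all padded to the same spatial size and concatenated along the channel axis
    conv1 = tf.nn.relu(tf.nn.conv2d(input, create_weights([1, 1, num_input_channels, num_filters]),
                                    strides=[1, 1, 1, 1], padding='SAME'))
    conv3 = tf.nn.relu(tf.nn.conv2d(input, create_weights([3, 3, num_input_channels, num_filters]),
                                    strides=[1, 1, 1, 1], padding='SAME'))
    conv5 = tf.nn.relu(tf.nn.conv2d(input, create_weights([5, 5, num_input_channels, num_filters]),
                                    strides=[1, 1, 1, 1], padding='SAME'))
    pool = tf.nn.max_pool(input, ksize=[1, 3, 3, 1], strides=[1, 1, 1, 1], padding='SAME')
    # output has 3 * num_filters + num_input_channels channels
    return tf.concat([conv1, conv3, conv5, pool], axis=3)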
