TensorFlow NaN loss during CNN model training for image classification
【Posted】2018-08-03 00:26:15
【Question】I am following the CNN MNIST tutorial at https://www.tensorflow.org/tutorials/layers and adapting it to my own image-classification task. My input images are 224 * 224 * 3 instead of the tutorial's 28 * 28, and I have only 5 classes instead of 10. I have read previous posts on this problem, and many point out that either the learning rate is too large or the way the cross-entropy loss is computed can be the cause, but I am not sure whether either applies here (a sketch of what those two changes would look like is included after the code below).
As soon as training starts, I immediately get this NaN-loss error:
ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
File "cnn_model.py", line 75, in <module>
main(sys.argv[1], sys.argv[2])
File "cnn_model.py", line 68, in main
classifier.train(input_fn = train_input_fn, steps = 2000, hooks = [logging_hook])
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 241, in train
loss = self._train_model(input_fn=input_fn, hooks=hooks)
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 612, in _train_model
_, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 505, in run
run_metadata=run_metadata)
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 842, in run
run_metadata=run_metadata)
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 798, in run
return self._sess.run(*args, **kwargs)
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 960, in run
run_metadata=run_metadata))
File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\basic_session_run_hooks.py", line 477, in after_run
raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.
Here is the model code:
import tensorflow as tf
from helper import load_data_and_label
import cv2
import sys
import math
def cnn_model_fn(features, labels, mode):
    # input layer
    input_layer = tf.reshape(features['x'], [-1, 224, 224, 3])
    # conv layer 1
    conv1 = tf.layers.conv2d(inputs=input_layer, filters=32, kernel_size=[5, 5],
                             padding='same', activation=tf.nn.relu)
    # pooling layer 1
    pool1 = tf.layers.max_pooling2d(inputs=conv1, pool_size=[2, 2], strides=2)
    # conv2 and pool2 layers
    conv2 = tf.layers.conv2d(inputs=pool1, filters=64, kernel_size=[5, 5], padding='same', activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(inputs=conv2, pool_size=[2, 2], strides=2)
    # conv3 and pool3 layers
    conv3 = tf.layers.conv2d(inputs=pool2, filters=64, kernel_size=[5, 5], padding='same', activation=tf.nn.relu)
    pool3 = tf.layers.max_pooling2d(inputs=conv3, pool_size=[2, 2], strides=2)
    # conv4 and pool4 layers
    conv4 = tf.layers.conv2d(inputs=pool3, filters=64, kernel_size=[5, 5], padding='same', activation=tf.nn.relu)
    pool4 = tf.layers.max_pooling2d(inputs=conv4, pool_size=[2, 2], strides=2)
    # conv5 and pool5 layers
    conv5 = tf.layers.conv2d(inputs=pool4, filters=64, kernel_size=[5, 5], padding='same', activation=tf.nn.relu)
    pool5 = tf.layers.max_pooling2d(inputs=conv5, pool_size=[2, 2], strides=2)
    # dense layer (224 is halved by each of the 5 pooling layers: 224 / 2**5 = 7)
    pool5_flat = tf.reshape(pool5, [-1, 7 * 7 * 64])
    dense = tf.layers.dense(inputs=pool5_flat, units=1024, activation=tf.nn.relu)
    dropout = tf.layers.dropout(inputs=dense, rate=0.5,
                                training=mode == tf.estimator.ModeKeys.TRAIN)
    # logits layer
    logits = tf.layers.dense(inputs=dropout, units=5)
    predictions = {
        "classes": tf.argmax(input=logits, axis=1),
        "prob": tf.nn.softmax(logits, name='softmax_tensor')
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions)
    # calculate loss
    onehot_labels = tf.one_hot(indices=tf.cast(labels, tf.int32), depth=5)
    loss = tf.losses.softmax_cross_entropy(onehot_labels=onehot_labels, logits=logits)
    # configure training operation
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=0.001)
        train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
    # evaluation metrics
    eval_metric_ops = {"accuracy": tf.metrics.accuracy(labels=labels, predictions=predictions["classes"])}
    return tf.estimator.EstimatorSpec(mode=mode, loss=loss, eval_metric_ops=eval_metric_ops)
def main(imagepath, labelpath):
    train_data, train_labels, eval_data, eval_labels = load_data_and_label(imagepath, labelpath)
    classifier = tf.estimator.Estimator(model_fn=cnn_model_fn, model_dir="/tmp/retina_convnet_model")
    tensors_to_log = {"prob": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log, every_n_iter=50)
    # train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": train_data}, y=train_labels,
                                                        batch_size=32, num_epochs=None, shuffle=True)
    classifier.train(input_fn=train_input_fn, steps=2000, hooks=[logging_hook])
    # evaluate the model
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x={"x": eval_data}, y=eval_labels, num_epochs=1, shuffle=False)
    eval_results = classifier.evaluate(input_fn=eval_input_fn)
    print(eval_results)

if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])
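For context, the "learning rate too large" / cross-entropy suggestions mentioned at the top would only touch the loss and optimizer lines in cnn_model_fn. A minimal sketch of that variant (the 1e-4 learning rate is just an assumed smaller value, and sparse_softmax_cross_entropy is the integer-label alternative to the one-hot loss; I have not verified that either fixes this):

    # inside cnn_model_fn, replacing the one_hot + softmax_cross_entropy lines (sketch only)
    loss = tf.losses.sparse_softmax_cross_entropy(labels=tf.cast(labels, tf.int32), logits=logits)
    # and, in the TRAIN branch, an assumed smaller learning rate
    optimizer = tf.train.GradientDescentOptimizer(learning_rate=1e-4)
    train_op = optimizer.minimize(loss=loss, global_step=tf.train.get_global_step())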
Thank you very much! Any help would be greatly appreciated!
【Answer 1】Have you preprocessed the images? If not, maybe try normalizing the images in your helper function and see if that helps (a minimal sketch is shown below).
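A minimal sketch of that suggestion, assuming load_data_and_label returns the images as raw uint8 NumPy arrays (the helper is not shown in the question, so the scaling here is an assumption, not the actual pipeline):

import numpy as np

def normalize_images(images):
    # Assumed preprocessing: cast raw uint8 pixels to float32 and scale [0, 255] -> [0, 1],
    # so the network does not see inputs on the order of hundreds.
    return images.astype(np.float32) / 255.0

# e.g. right after loading, before building the input functions:
# train_data = normalize_images(train_data)
# eval_data = normalize_images(eval_data)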