Nan 在摘要直方图中: deconv2/biases
Posted
技术标签:
【中文标题】Nan 在摘要直方图中: deconv2/biases【英文标题】:Nan in summary histogram for: deconv2/biases 【发布时间】:2018-06-22 18:23:14 【问题描述】:我的图像的原始尺寸是 3900 x 6000 x 3。我制作重叠的形状 (232024, 28, 28, 3) 块,然后制作大小为 1000 的批次。我有一个用于语义分割的 CNN 模型,如下所示:
def conv_layer(inputs, filters, kernel_size, strides = 1, padding = "SAME", bias_constant = 0.0, name = "conv"):
    """2-D convolution + bias + ReLU layer with TensorBoard summaries.

    Args:
        inputs: 4-D input tensor (batch, height, width, channels); cast to
            float32 before the convolution.
        filters: Number of output channels.
        kernel_size: Two-element sequence (kernel_height, kernel_width).
        strides: Spatial stride applied to both height and width.
        padding: "SAME" or "VALID".
        bias_constant: Initial value for every bias element.
        name: Name scope for the layer's ops and summaries.

    Returns:
        The ReLU activation cast to tf.float16 (the dtype the rest of the
        model chains on).
    """
    with tf.name_scope(name):
        input_shape = inputs.shape.as_list()
        # stddev=0.1 keeps the initial weights small.  The previous code used
        # tf.truncated_normal's default stddev of 1.0, which makes activations
        # grow layer over layer and is a classic cause of the
        # "Nan in summary histogram" failure this model was hitting.
        filter_tensor = tf.truncated_normal(
            [kernel_size[0], kernel_size[1], input_shape[3], filters],
            stddev = 0.1, dtype = tf.float32)
        filter = tf.Variable(initial_value = filter_tensor, trainable = True, name = "kernel")
        bias = tf.Variable(tf.constant(bias_constant, shape=[filters]), name="bias")
        conv2d = tf.nn.conv2d(input = tf.cast(inputs, dtype = tf.float32),
                              filter = filter,
                              strides = [1, strides, strides, 1],
                              padding = padding)
        activation = tf.nn.relu(conv2d + bias)
        tf.summary.histogram("weights", filter)
        tf.summary.histogram("biases", bias)
        tf.summary.histogram("activations", activation)
        return tf.cast(activation, dtype = tf.float16)
def deconv_layer(inputs, filters, kernel_size, output_size, strides = 1, padding = "SAME", bias_constant = 0.0, name = "deconv"):
    """Transposed 2-D convolution + bias + ReLU layer with summaries.

    Args:
        inputs: 4-D input tensor (batch, height, width, channels); cast to
            float32 before the transposed convolution.
        filters: Number of output channels.
        kernel_size: Two-element sequence (kernel_height, kernel_width).
        output_size: Two-element sequence (out_height, out_width) of the
            upsampled result; the batch dimension is taken from `inputs`
            at run time.
        strides: Spatial stride applied to both height and width.
        padding: "SAME" or "VALID".
        bias_constant: Initial value for every bias element.
        name: Name scope for the layer's ops and summaries.

    Returns:
        The ReLU activation cast to tf.float16.
    """
    with tf.name_scope(name):
        input_shape = inputs.shape.as_list()
        # Batch size is dynamic, so the output shape must be built with
        # tf.shape(inputs)[0] rather than the (possibly None) static shape.
        deconv_shape = tf.stack([tf.shape(inputs)[0], output_size[0], output_size[1], filters])
        # Note the filter layout for conv2d_transpose is
        # [h, w, out_channels, in_channels] — reversed vs. conv2d.
        # stddev=0.1 replaces the default stddev of 1.0, which caused
        # exploding activations and NaN summary histograms.
        filter_tensor = tf.truncated_normal(
            [kernel_size[0], kernel_size[1], filters, input_shape[3]],
            stddev = 0.1, dtype = tf.float32)
        filter = tf.Variable(initial_value = filter_tensor, trainable = True, name = "kernel")
        bias = tf.Variable(tf.constant(bias_constant, shape=[filters]), name="bias")
        conv2d_transpose = tf.nn.conv2d_transpose(value = tf.cast(inputs, dtype = tf.float32),
                                                  filter = filter,
                                                  strides = [1, strides, strides, 1],
                                                  output_shape = deconv_shape,
                                                  padding = padding)
        activation = tf.nn.relu(conv2d_transpose + bias)
        tf.summary.histogram("weights", filter)
        tf.summary.histogram("biases", bias)
        tf.summary.histogram("activations", activation)
        return tf.cast(activation, dtype = tf.float16)
def semantic_seg_model(features, mode, batch_size):
    """Model function for CNN: a three-layer convolutional encoder followed
    by a mirrored transposed-convolution decoder for semantic segmentation.

    Returns the decoder output (float16) with the same spatial size and
    3 channels as `features`.
    """
    bias_constant = 0.1
    conv_filters = [20, 50, 90]
    conv_sizes = []
    tf.summary.image('input', features, batch_size)

    # --- Encoder ------------------------------------------------------
    # conv1 keeps resolution (stride 1); conv2 and conv3 halve it (stride 2).
    # The static shape after each layer is recorded so the decoder can
    # upsample back to exactly the same sizes.
    net = features
    for layer_idx, (n_filters, stride) in enumerate(zip(conv_filters, (1, 2, 2)), start = 1):
        net = conv_layer(inputs = net,
                         filters = n_filters,
                         kernel_size = [5, 5],
                         strides = stride,
                         bias_constant = bias_constant,
                         name = "conv%d" % layer_idx)
        conv_sizes.append(net.shape.as_list())
        print(net.shape)

    # --- Decoder ------------------------------------------------------
    # Mirrors the encoder: each stride-2 deconv restores the spatial size
    # recorded after the corresponding encoder layer.
    net = deconv_layer(inputs = net,
                       filters = conv_filters[1],
                       kernel_size = [5, 5],
                       bias_constant = bias_constant,
                       strides = 2,
                       output_size = [conv_sizes[1][1], conv_sizes[1][2]],
                       name = "deconv3")
    print(net.shape)

    net = deconv_layer(inputs = net,
                       filters = conv_filters[0],
                       kernel_size = [5, 5],
                       bias_constant = bias_constant,
                       strides = 2,
                       output_size = [conv_sizes[0][1], conv_sizes[0][2]],
                       name = "deconv2")
    print(net.shape)

    # Final stride-1 deconv maps back to 3 channels at the input resolution.
    net = deconv_layer(inputs = net,
                       filters = 3,
                       kernel_size = [5, 5],
                       output_size = [features.shape.as_list()[1], features.shape.as_list()[2]],
                       bias_constant = bias_constant,
                       name = "deconv1")
    print(net.shape)
    return net
# ---------------------------------------------------------------------------
# Training driver: input queue -> model -> loss / optimizer / metrics,
# then the epoch loop under a queue-runner Coordinator.
# Relies on `features`, `labels`, `writer` and `saver` defined elsewhere
# in the file.
# ---------------------------------------------------------------------------
epochs = 1000
# BUG FIX: the original value 1e-50 underflows to an effectively-zero step,
# so Adam never updates the weights.  1e-4 is a sane default and is the
# value reported to work for this model.
learning_rate = 1e-4

image, label = tf.train.slice_input_producer([features, labels], shuffle = False)
BATCH_SIZE = 1000
THREAD_NUM = 5
MIN_AFTER_DEQUEUE = 10000
queue_capacity = MIN_AFTER_DEQUEUE + THREAD_NUM * BATCH_SIZE
image_batch, label_batch = tf.train.batch(tensors = [image, label],
                                          batch_size = BATCH_SIZE,
                                          capacity = queue_capacity,
                                          num_threads = THREAD_NUM,
                                          allow_smaller_final_batch = True)

output = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE)

# Cost: per-pixel softmax cross-entropy against the label batch.
with tf.name_scope("cross_entropy"):
    cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits = output, labels = label_batch)
    cost = tf.reduce_mean(cross_entropy)
    tf.summary.scalar("xent", cost)

# Optimizer
with tf.name_scope("optimizer"):
    optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)

# Accuracy: argmax agreement between prediction and label.
with tf.name_scope("accuracy"):
    correct_prediction = tf.equal(tf.argmax(label_batch, 1), tf.argmax(output, 1))
    accr = tf.reduce_mean(tf.cast(correct_prediction, tf.float16))
    tf.summary.scalar("accuracy", accr)

merged_summary = tf.summary.merge_all()

# Session configs
config = tf.ConfigProto()
config.log_device_placement = True
config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction=0.8

# Initialize session and start the input-queue threads.
sess = tf.Session(config=config)
sess.run(tf.global_variables_initializer())
coord = tf.train.Coordinator()
enqueue_threads = tf.train.start_queue_runners(sess = sess, coord = coord)

try:
    for epoch in range(epochs):
        if coord.should_stop():
            break
        train_loss = []; train_accuracy = []
        s = sess.run(merged_summary)
        writer.add_summary(s, epoch)
        for batch in range(math.ceil(features.shape.as_list()[0] / BATCH_SIZE)):
            _, sess_cost, sess_accuracy = sess.run([optimizer, cost, accr])
            train_loss.append(sess_cost)
            train_accuracy.append(sess_accuracy)
        train_loss = np.mean(train_loss)
        train_accuracy = np.mean(train_accuracy)
        saver.save(sess, "./semantic_seg_model_1", global_step=epoch)
        # BUG FIX: report the epoch means; the original printed only the
        # last batch's sess_cost / sess_accuracy while the computed means
        # were never used.
        print("[%02d/%02d] trainLoss: %.4f trainAcc: %.2f"
              % (epoch + 1, epochs, train_loss, train_accuracy))
except Exception as e:
    # Report exceptions to the coordinator.
    coord.request_stop(e)
finally:
    # Terminate as usual. It is safe to call `coord.request_stop()` twice.
    coord.request_stop()
    coord.join(enqueue_threads)
sess.close()
我在开始训练会话时遇到错误。错误如下：
[01/1000] trainLoss: 0.0000 trainAcc: 1.00
INFO:tensorflow:Error 报告给 Coordinator: , Nan 总结直方图:deconv2/biases [[节点:deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]] [[节点:batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
由操作“deconv2/biases”引起,定义在:文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\runpy.py”,第 193 行, 在 _run_module_as_main "main", mod_spec) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\runpy.py",第 85 行, 在 _run_code 执行(代码,run_globals)文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py”, 第 16 行,在 app.launch_new_instance() 文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\traitlets\config\application.py”, 第 658 行,在 launch_instance app.start() 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelapp.py", 第 478 行,开始 self.io_loop.start() 文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\ioloop.py”, 第 177 行,开始 super(ZMQIOLoop, self).start() 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\ioloop.py", 第 888 行,开始 handler_func(fd_obj, events) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py", 第 277 行,在 null_wrapper 中 返回 fn(*args, **kwargs) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", 第 440 行,在 _handle_events 中 self._handle_recv() 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", 第 472 行,在 _handle_recv self._run_callback(callback, msg) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", 第 414 行,在 _run_callback 回调(*args,**kwargs)文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py”, 第 277 行,在 null_wrapper 中 返回 fn(*args, **kwargs) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", 第 281 行,在调度程序中 return self.dispatch_shell(stream, msg) File "c:\users\fawad 
khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", 第 232 行,在 dispatch_shell 中 处理程序(流,身份,味精)文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py”, 第 397 行,在 execute_request 中 user_expressions, allow_stdin) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\ipkernel.py", 第 208 行,在 do_execute 中 res = shell.run_cell(代码,store_history=store_history,silent=silent)文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\zmqshell.py", 第 533 行,在 run_cell 中 返回 super(ZMQInteractiveShell, self).run_cell(*args, **kwargs) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", 第 2728 行,在 run_cell 中 交互性=交互性,编译器=编译器,结果=结果)文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", 第 2850 行,在 run_ast_nodes if self.run_code(code, result): File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", 第 2910 行,在 run_code 中 exec(code_obj, self.user_global_ns, self.user_ns) 文件“”,第 1 行,在 输出 = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE) 文件 "",第 107 行,在 semantic_seg_model 中 name = "deconv2") 文件 "",第 78 行,在 deconv_layer tf.summary.histogram("biases", bias) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\summary\summary.py", 第 192 行,在直方图中 tag=tag, values=values, name=scope) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\gen_logging_ops.py", 第 187 行,在 _histogram_summary 中 "HistogramSummary", tag=tag, values=values, name=name) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\op_def_library.py", 第 787 行,在 _apply_op_helper op_def=op_def) 文件 "c:\users\fawad 
khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", 第 2956 行,在 create_op 中 op_def=op_def) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", 第 1470 行,在 init 中 self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError(参见上面的回溯):Nan 总结 直方图:deconv2/biases [[节点:deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]] [[节点:batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
此 epoch 完成的迭代次数:0 -------------------------------------------------- ------------------------- InvalidArgumentError Traceback(最近调用 最后)c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py 在 _do_call(self, fn, *args) 1322 中尝试: -> 1323 return fn(*args) 1324 除了errors.OpError as e:
c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py 在 _run_fn(会话,feed_dict,fetch_list,target_list,选项, 运行元数据)1301 feed_dict, 获取列表,目标列表, -> 1302 状态,run_metadata) 1303
c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\errors_impl.py 在退出(自我,type_arg,value_arg,traceback_arg) 第472章 --> 473 c_api.TF_GetCode(self.status.status)) 474 # 从内存中删除底层状态对象,否则它保持活动状态
InvalidArgumentError: Nan in summary histogram for: deconv2/biases [[节点:deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]] [[节点:batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
在处理上述异常的过程中,又发生了一个异常:
InvalidArgumentError Traceback(最近调用 最后)在() 40 # 像往常一样终止。拨打
coord.request_stop()
两次是安全的。 41 coord.request_stop() ---> 42 coord.join(enqueue_threads) 43 44 sess.close()c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\training\coordinator.py 在加入(自我,线程,stop_grace_period_secs,ignore_live_threads) 第387章 第388章 --> 389 6.reraise(*self._exc_info_to_raise) 390 elif 落后者: 391 如果忽略_live_threads:
c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\six.py 在再加注中(tp,价值,tb) 691 如果 value.traceback 不是 tb: 第692章 --> 693 提升值 694终于: 695 值 = 无
在 () 13 火车损失 = []; train_accuracy = [] 14 ---> 15 秒 = sess.run(merged_summary) 16 writer.add_summary(s, epoch) 17
c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py 在运行中(self,fetches,feed_dict,options,run_metadata) 887尝试: 第888章 --> 889 run_metadata_ptr) 890 如果运行元数据: 第891章
c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py 在 _run(self、handle、fetches、feed_dict、options、run_metadata) 1118 如果 final_fetches 或 final_targets 或(句柄和 feed_dict_tensor): 1119 结果 = self._do_run(handle, final_targets,final_fetches, -> 1120 feed_dict_tensor, options, run_metadata) 1121 else: 1122 results = []
c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py 在 _do_run(self, handle, target_list, fetch_list, feed_dict, options, run_metadata)1315 如果句柄为无:1316 返回 self._do_call(_run_fn, self._session, feeds, fetches, 目标, -> 1317 选项,run_metadata) 1318 else: 1319 return self._do_call(_prun_fn, self._session, 处理、提要、获取)
c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\client\session.py 在 _do_call(self, fn, *args) 1334 中,除了 KeyError: 1335 经过 -> 1336 raise type(e)(node_def, op, message) 1337 1338 def _extend_graph(self):
InvalidArgumentError: Nan in summary histogram for: deconv2/biases [[节点:deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]] [[节点:batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
由操作“deconv2/biases”引起,定义在:文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\runpy.py”,第 193 行, 在 _run_module_as_main "main", mod_spec) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\runpy.py",第 85 行, 在 _run_code 执行(代码,run_globals)文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel_launcher.py”, 第 16 行,在 app.launch_new_instance() 文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\traitlets\config\application.py”, 第 658 行,在 launch_instance app.start() 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelapp.py", 第 478 行,开始 self.io_loop.start() 文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\ioloop.py”, 第 177 行,开始 super(ZMQIOLoop, self).start() 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\ioloop.py", 第 888 行,开始 handler_func(fd_obj, events) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py", 第 277 行,在 null_wrapper 中 返回 fn(*args, **kwargs) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", 第 440 行,在 _handle_events 中 self._handle_recv() 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", 第 472 行,在 _handle_recv self._run_callback(callback, msg) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\zmq\eventloop\zmqstream.py", 第 414 行,在 _run_callback 回调(*args,**kwargs)文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tornado\stack_context.py”, 第 277 行,在 null_wrapper 中 返回 fn(*args, **kwargs) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", 第 281 行,在调度程序中 return self.dispatch_shell(stream, msg) File "c:\users\fawad 
khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py", 第 232 行,在 dispatch_shell 中 处理程序(流,身份,味精)文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\kernelbase.py”, 第 397 行,在 execute_request 中 user_expressions, allow_stdin) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\ipkernel.py", 第 208 行,在 do_execute 中 res = shell.run_cell(代码,store_history=store_history,silent=silent)文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\ipykernel\zmqshell.py", 第 533 行,在 run_cell 中 返回 super(ZMQInteractiveShell, self).run_cell(*args, **kwargs) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", 第 2728 行,在 run_cell 中 交互性=交互性,编译器=编译器,结果=结果)文件“c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", 第 2850 行,在 run_ast_nodes if self.run_code(code, result): File "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\IPython\core\interactiveshell.py", 第 2910 行,在 run_code 中 exec(code_obj, self.user_global_ns, self.user_ns) 文件“”,第 1 行,在 输出 = semantic_seg_model(image_batch, tf.estimator.ModeKeys.TRAIN, BATCH_SIZE) 文件 "",第 107 行,在 semantic_seg_model 中 name = "deconv2") 文件 "",第 78 行,在 deconv_layer tf.summary.histogram("biases", bias) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\summary\summary.py", 第 192 行,在直方图中 tag=tag, values=values, name=scope) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\ops\gen_logging_ops.py", 第 187 行,在 _histogram_summary 中 "HistogramSummary", tag=tag, values=values, name=name) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\op_def_library.py", 第 787 行,在 _apply_op_helper op_def=op_def) 文件 "c:\users\fawad 
khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", 第 2956 行,在 create_op 中 op_def=op_def) 文件 "c:\users\fawad khalil\appdata\local\programs\python\python36\lib\site-packages\tensorflow\python\framework\ops.py", 第 1470 行,在 init 中 self._traceback = self._graph._extract_stack() # pylint: disable=protected-access
InvalidArgumentError(参见上面的回溯):Nan 总结 直方图:deconv2/biases [[节点:deconv2/biases = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](deconv2/biases/tag, deconv2/bias/read/_105)]] [[节点:batch/fifo_queue_Size/_91 = _Recvclient_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_37_batch/fifo_queue_Size", tensor_type=DT_INT32, _device="/job:localhost/replica:0/task:0/device:GPU:0"]]
github tensorflow issues 上有人建议在模型发散时尝试降低学习率，但这并没有帮助。另一个人建议应将 dtype 从 float16 更改为 float32，因为 float16 存在问题。当我将数据的 dtype 更改为 float32 时，我在 python 日志控制台中收到以下错误：
[libprotobuf 错误 C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\36\cmake_build\protobuf\src\protobuf\src\google\protobuf\message_lite.cc:297] 超过 2GB 的最大 protobuf 大小。 [libprotobuf 错误 C:\tf_jenkins\home\workspace\rel-win\M\windows-gpu\PY\36\cmake_build\protobuf\src\protobuf\src\google\protobuf\message_lite.cc:297] 超过 2GB 的最大 protobuf 大小。
当我尝试增加图像重叠块的宽度和高度时,也会发生同样的错误。我也尝试过减少 BATCH_SIZE 但没有帮助。
我有 4GB NVIDIA GeForce GTX 960M 专用显卡和 16GB RAM,配备 Intel Core i7-6700HQ CPU @ 2.60 GHz 2.60 GHz。 Python 版本为 3.6.4,Tensorflow 版本为 1.4,带 GPU。
更新 1: 更新模型:
def semantic_seg_model(features, mode, batch_size):
    # Smaller revision of the model (see "Update 1" in the post): two conv
    # layers with 2x2 kernels and stride 1, mirrored by two deconv layers.
    # Returns a float16 tensor with 3 channels at the input resolution.
    bias_constant = 0.1
    # Fewer/narrower filters than the first version to reduce overfitting.
    conv_filters = [10, 25, 90]
    conv_sizes = []  # static shape after each conv layer, for decoder sizing
    tf.summary.image('input', features, batch_size)
    """Model function for CNN."""
    # Encoding starts here.
    # Convolutional Layer 1
    # Input: 100 x 100
    conv = conv_layer(inputs=features,
                      filters=conv_filters[0],
                      kernel_size=[2, 2],
                      bias_constant = bias_constant,
                      name = "conv1")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)
    # Convolutional Layer 2
    # Input: 100 x 100
    conv = conv_layer(inputs = conv,
                      filters = conv_filters[1],
                      kernel_size = [2, 2],
                      bias_constant = bias_constant,
                      name = "conv2")
    conv_sizes.append(conv.shape.as_list())
    print(conv.shape)
    # Deconvolution Layer 2
    # Input: 100 x 100
    # Upsamples back to the size recorded after conv1.
    deconv = deconv_layer(inputs = conv,
                          filters = conv_filters[0],
                          kernel_size = [2, 2],
                          bias_constant = bias_constant,
                          output_size = [conv_sizes[0][1], conv_sizes[0][2]],
                          name = "deconv2")
    print(deconv.shape)
    # Final deconv maps to 3 output channels at the original input size.
    deconv = deconv_layer(inputs = deconv,
                          filters = 3,
                          kernel_size = [2, 2],
                          output_size = [features.shape.as_list()[1], features.shape.as_list()[2]],
                          bias_constant = bias_constant,
                          name = "deconv1")
    print(deconv.shape)
    return tf.cast(deconv, dtype = tf.float16)
【问题讨论】:
【参考方案1】:我怀疑问题在于您明显过度拟合;真正的证据是:
[01/1000] trainLoss: 0.0000 trainAcc: 1.00
这表示仅在一个 epoch 之后,您就完全适合训练数据;过拟合的明确迹象。因此,由此产生的NaN
可能是这个问题的一个不足为奇的效果,因为您现在几乎可以肯定已经学习了权重,这些权重将在它没有看到的数据或批次上返回0
或inf
(因为它是如此糟糕过拟合)。
要解决此问题,我建议您大幅简化模型,直到您得到不会很快过拟合的模型;例如,越来越小的 conv 和 deconv 层。然后,您可以开始重新构建这种复杂性。然后,您还会发现您可能希望构建一些 dropout 和/或批量标准化来处理这种过度拟合(注意:虽然很容易开始将这种复杂性添加到现有模型中,但我建议不要这样做;得到一些东西首先简单的工作,然后从那里增加复杂性......)。
最后说明:如果您按照上面的建议简化问题,您可能会有更好的minimal example 来分享;这应该可以让我们更快地查明您的问题。
【讨论】:
感谢您的宝贵时间。我已经更新了我的模型并包含在更新中。我减少了 conv 和 deconv 层的数量、过滤器的数量和内核大小,我也能够将学习率降低到 1e-4。现在模型工作正常,但您能否指导一下为什么 tf.float32 dtype 仍然无法正常工作并且在超过 2 GB 的 protobuf 限制时出现相同的错误? 我还想增加图像重叠补丁的大小,因为我认为它会给我带来更好的准确性。但由于超出限制错误而无法增加。有什么解决办法吗?建议使用 sharded=True 参数创建模型保护程序,但这不起作用。以上是关于Nan 在摘要直方图中: deconv2/biases的主要内容,如果未能解决你的问题,请参考以下文章
python 加载CSV \检查cat功能\显示NaN \显示摘要数据的数量