keras multiple_gpu_model 导致“无法腌制模块对象”错误
Posted
技术标签:
【中文标题】keras multiple_gpu_model 导致“无法腌制模块对象”错误【英文标题】:keras multiple_gpu_model causes "Can't pickle module object" error 【发布时间】:2018-10-18 23:09:06 【问题描述】:这是question 的后续行动。我正在尝试使用 8 个 GPU 进行训练,并且正在使用 Keras 的 multiple_gpu_model
。我指定了 128 的批量大小,它将在 8 个 GPU 中拆分,每个 GPU 有 16 个。现在,使用此配置,我收到以下错误:
Train on 6120 samples, validate on 323 samples
Epoch 1/100
6120/6120 [==============================] - 42s 7ms/step - loss: 0.0996 - mean_iou: 0.6919 - val_loss: 0.0969 - val_mean_iou: 0.7198
Epoch 00001: val_loss improved from inf to 0.09686, saving model to test.h5
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-16-00e92d5b765a> in <module>()
3 checkpointer = ModelCheckpoint('test.h5', verbose=1, save_best_only=True)
4 results = parallel_model.fit(X_train, Y_train, validation_split=0.05, batch_size = 128, verbose=1, epochs=100,
----> 5 callbacks=[earlystopper, checkpointer])
~/anaconda/envs/dl/lib/python3.6/site-packages/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, **kwargs)
1703 initial_epoch=initial_epoch,
1704 steps_per_epoch=steps_per_epoch,
-> 1705 validation_steps=validation_steps)
1706
1707 def evaluate(self, x=None, y=None,
~/anaconda/envs/dl/lib/python3.6/site-packages/keras/engine/training.py in _fit_loop(self, f, ins, out_labels, batch_size, epochs, verbose, callbacks, val_f, val_ins, shuffle, callback_metrics, initial_epoch, steps_per_epoch, validation_steps)
1254 for l, o in zip(out_labels, val_outs):
1255 epoch_logs['val_' + l] = o
-> 1256 callbacks.on_epoch_end(epoch, epoch_logs)
1257 if callback_model.stop_training:
1258 break
~/anaconda/envs/dl/lib/python3.6/site-packages/keras/callbacks.py in on_epoch_end(self, epoch, logs)
75 logs = logs or
76 for callback in self.callbacks:
---> 77 callback.on_epoch_end(epoch, logs)
78
79 def on_batch_begin(self, batch, logs=None):
~/anaconda/envs/dl/lib/python3.6/site-packages/keras/callbacks.py in on_epoch_end(self, epoch, logs)
445 self.model.save_weights(filepath, overwrite=True)
446 else:
--> 447 self.model.save(filepath, overwrite=True)
448 else:
449 if self.verbose > 0:
~/anaconda/envs/dl/lib/python3.6/site-packages/keras/engine/topology.py in save(self, filepath, overwrite, include_optimizer)
2589 """
2590 from ..models import save_model
-> 2591 save_model(self, filepath, overwrite, include_optimizer)
2592
2593 def save_weights(self, filepath, overwrite=True):
~/anaconda/envs/dl/lib/python3.6/site-packages/keras/models.py in save_model(model, filepath, overwrite, include_optimizer)
124 f.attrs['model_config'] = json.dumps(
125 'class_name': model.__class__.__name__,
--> 126 'config': model.get_config()
127 , default=get_json_type).encode('utf8')
128
~/anaconda/envs/dl/lib/python3.6/site-packages/keras/engine/topology.py in get_config(self)
2430 model_outputs.append([layer.name, new_node_index, tensor_index])
2431 config['output_layers'] = model_outputs
-> 2432 return copy.deepcopy(config)
2433
2434 @classmethod
~/anaconda/envs/dl/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
148 copier = _deepcopy_dispatch.get(cls)
149 if copier:
--> 150 y = copier(x, memo)
151 else:
152 try:
~/anaconda/envs/dl/lib/python3.6/copy.py in _deepcopy_dict(x, memo, deepcopy)
238 memo[id(x)] = y
239 for key, value in x.items():
--> 240 y[deepcopy(key, memo)] = deepcopy(value, memo)
241 return y
242 d[dict] = _deepcopy_dict
~/anaconda/envs/dl/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
148 copier = _deepcopy_dispatch.get(cls)
149 if copier:
--> 150 y = copier(x, memo)
151 else:
152 try:
~/anaconda/envs/dl/lib/python3.6/copy.py in _deepcopy_list(x, memo, deepcopy)
213 append = y.append
214 for a in x:
--> 215 append(deepcopy(a, memo))
216 return y
217 d[list] = _deepcopy_list
~/anaconda/envs/dl/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
148 copier = _deepcopy_dispatch.get(cls)
149 if copier:
--> 150 y = copier(x, memo)
151 else:
152 try:
~/anaconda/envs/dl/lib/python3.6/copy.py in _deepcopy_dict(x, memo, deepcopy)
238 memo[id(x)] = y
239 for key, value in x.items():
--> 240 y[deepcopy(key, memo)] = deepcopy(value, memo)
241 return y
242 d[dict] = _deepcopy_dict
~/anaconda/envs/dl/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
148 copier = _deepcopy_dispatch.get(cls)
149 if copier:
--> 150 y = copier(x, memo)
151 else:
152 try:
~/anaconda/envs/dl/lib/python3.6/copy.py in _deepcopy_dict(x, memo, deepcopy)
238 memo[id(x)] = y
239 for key, value in x.items():
--> 240 y[deepcopy(key, memo)] = deepcopy(value, memo)
241 return y
242 d[dict] = _deepcopy_dict
~/anaconda/envs/dl/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
148 copier = _deepcopy_dispatch.get(cls)
149 if copier:
--> 150 y = copier(x, memo)
151 else:
152 try:
~/anaconda/envs/dl/lib/python3.6/copy.py in _deepcopy_tuple(x, memo, deepcopy)
218
219 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 220 y = [deepcopy(a, memo) for a in x]
221 # We're not going to put the tuple in the memo, but it's still important we
222 # check for it, in case the tuple contains recursive mutable structures.
~/anaconda/envs/dl/lib/python3.6/copy.py in <listcomp>(.0)
218
219 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 220 y = [deepcopy(a, memo) for a in x]
221 # We're not going to put the tuple in the memo, but it's still important we
222 # check for it, in case the tuple contains recursive mutable structures.
~/anaconda/envs/dl/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
148 copier = _deepcopy_dispatch.get(cls)
149 if copier:
--> 150 y = copier(x, memo)
151 else:
152 try:
~/anaconda/envs/dl/lib/python3.6/copy.py in _deepcopy_tuple(x, memo, deepcopy)
218
219 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 220 y = [deepcopy(a, memo) for a in x]
221 # We're not going to put the tuple in the memo, but it's still important we
222 # check for it, in case the tuple contains recursive mutable structures.
~/anaconda/envs/dl/lib/python3.6/copy.py in <listcomp>(.0)
218
219 def _deepcopy_tuple(x, memo, deepcopy=deepcopy):
--> 220 y = [deepcopy(a, memo) for a in x]
221 # We're not going to put the tuple in the memo, but it's still important we
222 # check for it, in case the tuple contains recursive mutable structures.
~/anaconda/envs/dl/lib/python3.6/copy.py in deepcopy(x, memo, _nil)
167 reductor = getattr(x, "__reduce_ex__", None)
168 if reductor:
--> 169 rv = reductor(4)
170 else:
171 reductor = getattr(x, "__reduce__", None)
TypeError: can't pickle module objects
当我指定批量大小为 256 时,网络甚至不会运行(请参阅其他链接的问题)。但是单个 GPU 能够处理 32 的批量大小。我无法指出这里出了什么问题以及如何解决这个错误。它只是批量大小吗?对我来说,这似乎更像是一个并行化问题。
【问题讨论】:
Can not save model using model.save following multi_gpu_model in Keras的可能重复self.model.save
处抛出错误(见this github issue 和the workaround)
【参考方案1】:
如果您在回调中使用 ModelCheckpoint 函数,则应在 ModelCheckpoint 函数中添加 para 'save_weights_only=True':
from keras.callbacks import ModelCheckpoint
callbacks_list = [ModelCheckpoint(top_weights_path, monitor='val_loss',
verbose=1, save_best_only=True, save_weights_only=True)]
希望有用
【讨论】:
以上是关于keras multiple_gpu_model 导致“无法腌制模块对象”错误的主要内容,如果未能解决你的问题,请参考以下文章