如何清除 Colab Tensorflow TPU 内存
Posted
技术标签:
【中文标题】如何清除 Colab Tensorflow TPU 内存【英文标题】:How to clear Colab Tensorflow TPU memory 【发布时间】:2021-10-05 12:35:36 【问题描述】:我正在分多折(fold)训练一个模型。每训练完一折后,我都想清除 TPU 内存,以免出现 OOM(内存不足)错误。
以下是当前错误的完整回溯(traceback):
ResourceExhaustedError Traceback (most recent call last)
<ipython-input-16-b7e0725f5c4d> in <module>()
1 tf.keras.backend.clear_session()
2 with config.strategy.scope():
----> 3 model = build_model(config.img_size, count = count_data_items(files_train)/config.batch_size)
25 frames
<ipython-input-9-5b219db28f69> in build_model(size, count)
1 def build_model(size, count=820):
2
----> 3 base_model = efn.EfficientNetB7(input_shape=(size,size,3),weights='imagenet',include_top=False)
4
5 model = tf.keras.Sequential([
/usr/local/lib/python3.7/dist-packages/efficientnet/__init__.py in wrapper(*args, **kwargs)
55 kwargs['models'] = tfkeras.models
56 kwargs['utils'] = tfkeras.utils
---> 57 return func(*args, **kwargs)
58
59 return wrapper
/usr/local/lib/python3.7/dist-packages/efficientnet/model.py in EfficientNetB7(include_top, weights, input_tensor, input_shape, pooling, classes, **kwargs)
604 input_tensor=input_tensor, input_shape=input_shape,
605 pooling=pooling, classes=classes,
--> 606 **kwargs
607 )
608
/usr/local/lib/python3.7/dist-packages/efficientnet/model.py in EfficientNet(width_coefficient, depth_coefficient, default_resolution, dropout_rate, drop_connect_rate, depth_divisor, blocks_args, model_name, include_top, weights, input_tensor, input_shape, pooling, classes, **kwargs)
348 use_bias=False,
349 kernel_initializer=CONV_KERNEL_INITIALIZER,
--> 350 name='stem_conv')(x)
351 x = layers.BatchNormalization(axis=bn_axis, name='stem_bn')(x)
352 x = layers.Activation(activation, name='stem_activation')(x)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in __call__(self, *args, **kwargs)
968 if _in_functional_construction_mode(self, inputs, args, kwargs, input_list):
969 return self._functional_construction_call(inputs, args, kwargs,
--> 970 input_list)
971
972 # Maintains info about the `Layer.call` stack.
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _functional_construction_call(self, inputs, args, kwargs, input_list)
1106 # Check input assumptions set after layer building, e.g. input shape.
1107 outputs = self._keras_tensor_symbolic_call(
-> 1108 inputs, input_masks, args, kwargs)
1109
1110 if outputs is None:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _keras_tensor_symbolic_call(self, inputs, input_masks, args, kwargs)
838 return nest.map_structure(keras_tensor.KerasTensor, output_signature)
839 else:
--> 840 return self._infer_output_signature(inputs, args, kwargs, input_masks)
841
842 def _infer_output_signature(self, inputs, args, kwargs, input_masks):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _infer_output_signature(self, inputs, args, kwargs, input_masks)
876 # overridden).
877 # TODO(kaftan): do we maybe_build here, or have we already done it?
--> 878 self._maybe_build(inputs)
879 inputs = self._maybe_cast_inputs(inputs)
880 outputs = call_fn(inputs, *args, **kwargs)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in _maybe_build(self, inputs)
2623 # operations.
2624 with tf_utils.maybe_init_scope(self):
-> 2625 self.build(input_shapes) # pylint:disable=not-callable
2626 # We must set also ensure that the layer is marked as built, and the build
2627 # shape is stored since user defined build functions may not be calling
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/layers/convolutional.py in build(self, input_shape)
202 constraint=self.kernel_constraint,
203 trainable=True,
--> 204 dtype=self.dtype)
205 if self.use_bias:
206 self.bias = self.add_weight(
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer.py in add_weight(self, name, shape, dtype, initializer, regularizer, trainable, constraint, use_resource, synchronization, aggregation, **kwargs)
653 synchronization=synchronization,
654 aggregation=aggregation,
--> 655 caching_device=caching_device)
656 if regularizer is not None:
657 # TODO(fchollet): in the future, this should be handled at the
/usr/local/lib/python3.7/dist-packages/tensorflow/python/training/tracking/base.py in _add_variable_with_custom_getter(self, name, shape, dtype, initializer, getter, overwrite, **kwargs_for_getter)
813 dtype=dtype,
814 initializer=initializer,
--> 815 **kwargs_for_getter)
816
817 # If we set an initializer and the variable processed it, tracking will not
/usr/local/lib/python3.7/dist-packages/tensorflow/python/keras/engine/base_layer_utils.py in make_variable(name, shape, dtype, initializer, trainable, caching_device, validate_shape, constraint, use_resource, collections, synchronization, aggregation, partitioner)
137 synchronization=synchronization,
138 aggregation=aggregation,
--> 139 shape=variable_shape if variable_shape else None)
140
141
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in __call__(cls, *args, **kwargs)
258 def __call__(cls, *args, **kwargs):
259 if cls is VariableV1:
--> 260 return cls._variable_v1_call(*args, **kwargs)
261 elif cls is Variable:
262 return cls._variable_v2_call(*args, **kwargs)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in _variable_v1_call(cls, initial_value, trainable, collections, validate_shape, caching_device, name, variable_def, dtype, expected_shape, import_scope, constraint, use_resource, synchronization, aggregation, shape)
219 synchronization=synchronization,
220 aggregation=aggregation,
--> 221 shape=shape)
222
223 def _variable_v2_call(cls,
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in getter(**kwargs)
65
66 def getter(**kwargs):
---> 67 return captured_getter(captured_previous, **kwargs)
68
69 return getter
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_lib.py in creator_with_resource_vars(next_creator, **kwargs)
2109 checkpoint_restore_uid = None
2110
-> 2111 created = self._create_variable(next_creator, **kwargs)
2112
2113 if checkpoint_restore_uid is not None:
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in _create_variable(self, next_creator, **kwargs)
1167 self._container_strategy(), _real_mirrored_creator,
1168 distribute_utils.TPU_VARIABLE_CLASS_MAPPING,
-> 1169 distribute_utils.TPU_VARIABLE_POLICY_MAPPING, **kwargs)
1170
1171 def _gather_to_implementation(self, value, destinations, axis, options):
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/distribute_utils.py in create_mirrored_variable(strategy, real_mirrored_creator, class_mapping, policy_mapping, **kwargs)
304 # here.
305 with tape.stop_recording():
--> 306 value_list = real_mirrored_creator(**kwargs)
307 # MirroredVariable is recreated during saved_model loading, and its
308 # component variables (value_list) will have None initializer. We
/usr/local/lib/python3.7/dist-packages/tensorflow/python/distribute/tpu_strategy.py in _real_mirrored_creator(**kwargs)
1158
1159 with context.device_policy(context.DEVICE_PLACEMENT_SILENT):
-> 1160 v = next_creator(**kwargs)
1161
1162 assert not isinstance(v, tpu_values.TPUMirroredVariable)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in <lambda>(**kwargs)
197 shape=None):
198 """Call on Variable class. Useful to force the signature."""
--> 199 previous_getter = lambda **kwargs: default_variable_creator(None, **kwargs)
200 for _, getter in ops.get_default_graph()._variable_creator_stack: # pylint: disable=protected-access
201 previous_getter = _make_getter(getter, previous_getter)
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variable_scope.py in default_variable_creator(next_creator, **kwargs)
2624 synchronization=synchronization,
2625 aggregation=aggregation,
-> 2626 shape=shape)
2627 else:
2628 return variables.RefVariable(
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/variables.py in __call__(cls, *args, **kwargs)
262 return cls._variable_v2_call(*args, **kwargs)
263 else:
--> 264 return super(VariableMetaclass, cls).__call__(*args, **kwargs)
265
266
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/resource_variable_ops.py in __init__(self, initial_value, trainable, collections, validate_shape, caching_device, name, dtype, variable_def, import_scope, constraint, distribute_strategy, synchronization, aggregation, shape)
1593 aggregation=aggregation,
1594 shape=shape,
-> 1595 distribute_strategy=distribute_strategy)
1596
1597 def _init_from_args(self,
/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/resource_variable_ops.py in _init_from_args(self, initial_value, trainable, collections, caching_device, name, dtype, constraint, synchronization, aggregation, distribute_strategy, shape)
1729 dtype=dtype)
1730 if shape is not None:
-> 1731 if not initial_value.shape.is_compatible_with(shape):
1732 raise ValueError(
1733 "The initial value's shape (%s) is not compatible with "
/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py in shape(self)
1196 # `_tensor_shape` is declared and defined in the definition of
1197 # `EagerTensor`, in C.
-> 1198 self._tensor_shape = tensor_shape.TensorShape(self._shape_tuple())
1199 except core._NotOkStatusException as e:
1200 six.raise_from(core._status_to_exception(e.code, e.message), None)
ResourceExhaustedError: Failed to allocate request for 18.0KiB (18432B) on device ordinal 0
【问题讨论】:
【参考方案1】:我个人不会尝试清除 TPU 内存。如果 Google Colab TPU 上存在 OOM,请使用更小的批量大小、更小的模型,或者使用内存是 Colab TPU 两倍的 Kaggle TPU。
【讨论】:
但是我们确实清除了 GPU 内存,所以我想要介于两者之间的东西。
【参考方案2】:当我在 TPU 上执行超参数调整并希望在两次训练之间释放内存时,我使用 tf.tpu.experimental.initialize_tpu_system(hw_accelerator_handle) 。它会重置您的 TPU,同时保持与 TPU 的连接。在我的用例中,我每次都从头开始训练;它可能同样适用于您的用例。
其中 hw_accelerator_handle 是 tf.distribute.cluster_resolver.TPUClusterResolver() 返回的对象。
【讨论】:
以上是关于如何清除 Colab Tensorflow TPU 内存的主要内容,如果未能解决你的问题,请参考以下文章:
如何将 Cloud TPU 与 Tensorflow Lite Model Maker 结合使用?