maskrcnn详细注解说明(超详细)
Posted tangjunjun
tags:
篇首语:本文由小常识网(cha138.com)小编为大家整理,主要介绍了maskrcnn详细注解说明(超详细)相关的知识,希望对你有一定的参考价值。
此代码是我对maskrcnn的一些修改,基本还原所有内容,但更加简洁,使代码更易解读。里面有很多注释,非常详细,可自己慢慢品味。
若有一些问题,欢迎指正与交流。
此代码为训练文件.py
"""
MASKRCNN algrithm for object detection and instance segmentation
Written and modified by tang jun on JAN , 2019
if you have questions , please connect me by Email: tangjunjunfighter@163.com
"""
import scipy
# import os
# import random
# import datetime
# import re
# import math
# import logging
# from collections import OrderedDict
# import multiprocessing
# import numpy as np
import tensorflow as tf
import keras
# import keras.backend as K # keras中的后端backend及其相关函数
# import keras.layers as KL
# import keras.engine as KE
# import keras.models as KM
import math
import os
import sys
import numpy as np
import cv2
import matplotlib.pyplot as plt
import yaml
from PIL import Image
import random
# from mrcnn1 import utils, model as modellib, visualize
# from mrcnn1 import utils, model as modellib, visualize
import model as modellib
# from mrcnn1 import visualize
from distutils.version import LooseVersion
assert LooseVersion(tf.__version__) >= LooseVersion("1.3")
assert LooseVersion(keras.__version__) >= LooseVersion(‘2.0.8‘)
ROOT_DIR = os.getcwd() # 得到当前路径
sys.path.append(ROOT_DIR) # To find local version of the library
# Directory to save logs and trained models
MODEL_DIR = os.path.join(ROOT_DIR, "logs") # 在当前路径的logs文件路径
iter_num = 0
# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5") # 载入训练模型权重路径
class Config_config(object):
"""Base configuration class. For custom configurations, create a
sub-class that inherits from this one and override properties
that need to be changed.
"""
IMAGE_RESIZE_MODE = "square"
IMAGE_MIN_DIM = 128
IMAGE_MAX_DIM = 256
NUM_CLASSES = 1 + 4 # Override in sub-classes
PRE_NMS_LIMIT = 6000 # 判断在训练时候的提取层提取个数,若大于anchors则提取anchors个,否则相反
IMAGE_CHANNEL_COUNT = 3
# Name the configurations. For example, ‘COCO‘, ‘Experiment 3‘, ...etc.
# Useful if your code needs to do things differently depending on which
# experiment is running.
# NAME = "shapes" # Override in sub-classes
# GPU_COUNT = 1
# IMAGES_PER_GPU = 1
# Number of training steps per epoch
# This doesn‘t need to match the size of the training set. Tensorboard
# updates are saved at the end of each epoch, so setting this to a
# smaller number means getting more frequent TensorBoard updates.
# Validation stats are also calculated at each epoch end and they
# might take a while, so don‘t set this too small to avoid spending
# a lot of time on validation stats.
STEPS_PER_EPOCH = 5
# Number of validation steps to run at the end of every training epoch.
# A bigger number improves accuracy of validation stats, but slows
# down the training.
VALIDATION_STEPS = 50
# Backbone network architecture
# Supported values are: resnet50, resnet101.
# You can also provide a callable that should have the signature
# of model.resnet_graph. If you do so, you need to supply a callable
# to COMPUTE_BACKBONE_SHAPE as well
BACKBONE = "resnet101"
# Only useful if you supply a callable to BACKBONE. Should compute
# the shape of each layer of the FPN Pyramid.
# See model.compute_backbone_shapes
# COMPUTE_BACKBONE_SHAPE = None
# The strides of each layer of the FPN Pyramid. These values
# are based on a Resnet101 backbone.
BACKBONE_STRIDES = [4, 8, 16, 32, 64]
# Size of the fully-connected layers in the classification graph
FPN_CLASSIF_FC_LAYERS_SIZE = 1024
# Size of the top-down layers used to build the feature pyramid
TOP_DOWN_PYRAMID_SIZE = 256 # 定义rpn后每一层的通道数
# Number of classification classes (including background)
# Length of square anchor side in pixels
RPN_ANCHOR_SCALES = (8, 16, 32, 64, 128)
# Ratios of anchors at each cell (width/height)
# A value of 1 represents a square anchor, and 0.5 is a wide anchor
RPN_ANCHOR_RATIOS = [0.5, 1, 2]
# Anchor stride
# If 1 then anchors are created for each cell in the backbone feature map.
# If 2, then anchors are created for every other cell, and so on.
RPN_ANCHOR_STRIDE = 1
# Non-max suppression threshold to filter RPN proposals.
# You can increase this during training to generate more propsals.
RPN_NMS_THRESHOLD = 0.7 # 小于该阈值被保留
# How many anchors per image to use for RPN training
RPN_TRAIN_ANCHORS_PER_IMAGE = 256 # rpn数据需要此值,rpn网络也需要次之
# ROIs kept after non-maximum supression (training and inference)
POST_NMS_ROIS_TRAINING = 2000 # 训练模型在rpn后提取层的数量
POST_NMS_ROIS_INFERENCE = 1000 # 测试模型在rpn后提取层的数量
# If enabled, resizes instance masks to a smaller size to reduce
# memory load. Recommended when using high-resolution images.
USE_MINI_MASK = False
MINI_MASK_SHAPE = (56, 56) # (height, width) of the mini-mask
# Input image resizing
# Generally, use the "square" resizing mode for training and inferencing
# and it should work well in most cases. In this mode, images are scaled
# up such that the small side is = IMAGE_MIN_DIM, but ensuring that the
# scaling doesn‘t make the long side > IMAGE_MAX_DIM. Then the image is
# padded with zeros to make it a square so multiple images can be put
# in one batch.
# Available resizing modes:
# none: No resizing or padding. Return the image unchanged.
# square: Resize and pad with zeros to get a square image
# of size [max_dim, max_dim].
# pad64: Pads width and height with zeros to make them multiples of 64.
# If IMAGE_MIN_DIM or IMAGE_MIN_SCALE are not None, then it scales
# up before padding. IMAGE_MAX_DIM is ignored in this mode.
# The multiple of 64 is needed to ensure smooth scaling of feature
# maps up and down the 6 levels of the FPN pyramid (2**6=64).
# crop: Picks random crops from the image. First, scales the image based
# on IMAGE_MIN_DIM and IMAGE_MIN_SCALE, then picks a random crop of
# size IMAGE_MIN_DIM x IMAGE_MIN_DIM. Can be used in training only.
# IMAGE_MAX_DIM is not used in this mode.
IMAGE_RESIZE_MODE = "square"
# Minimum scaling ratio. Checked after MIN_IMAGE_DIM and can force further
# up scaling. For example, if set to 2 then images are scaled up to double
# the width and height, or more, even if MIN_IMAGE_DIM doesn‘t require it.
# Howver, in ‘square‘ mode, it can be overruled by IMAGE_MAX_DIM.
IMAGE_MIN_SCALE = 0
# Image mean (RGB)
MEAN_PIXEL = np.array([123.7, 116.8, 103.9])
# Number of ROIs per image to feed to classifier/mask heads
# The Mask RCNN paper uses 512 but often the RPN doesn‘t generate
# enough positive proposals to fill this and keep a positive:negative
# ratio of 1:3. You can increase the number of proposals by adjusting
# the RPN NMS threshold.
TRAIN_ROIS_PER_IMAGE = 100 # target层
# Percent of positive ROIs used to train classifier/mask heads
ROI_POSITIVE_RATIO = 0.33
# Pooled ROIs
POOL_SIZE = 7
MASK_POOL_SIZE = 14
# Shape of output mask
# To change this you also need to change the neural network mask branch
MASK_SHAPE = [28, 28]
# Maximum number of ground truth instances to use in one image
MAX_GT_INSTANCES = 100
# Bounding box refinement standard deviation for RPN and final detections.
RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
# Max number of final detections
DETECTION_MAX_INSTANCES = 100
# Minimum probability value to accept a detected instance
# ROIs below this threshold are skipped
DETECTION_MIN_CONFIDENCE = 0.7
# Non-maximum suppression threshold for detection
DETECTION_NMS_THRESHOLD = 0.3
# Learning rate and momentum
# The Mask RCNN paper uses lr=0.02, but on TensorFlow it causes
# weights to explode. Likely due to differences in optimzer
# implementation.
LEARNING_RATE = 0.001
LEARNING_MOMENTUM = 0.9
# Weight decay regularization
WEIGHT_DECAY = 0.0001
# Loss weights for more precise optimization.
# Can be used for R-CNN training setup.
LOSS_WEIGHTS = {
"rpn_class_loss": 1.,
"rpn_bbox_loss": 1.,
"mrcnn_class_loss": 1.,
"mrcnn_bbox_loss": 1.,
"mrcnn_mask_loss": 1.
}
# Use RPN ROIs or externally generated ROIs for training
# Keep this True for most situations. Set to False if you want to train
# the head branches on ROI generated by code rather than the ROIs from
# the RPN. For example, to debug the classifier head without having to
# train the RPN.
USE_RPN_ROIS = True
# Train or freeze batch normalization layers
# None: Train BN layers. This is the normal mode
# False: Freeze BN layers. Good when using a small batch size
# True: (don‘t use). Set layer in training mode even when inferencing
TRAIN_BN = True # Defaulting to False since batch size is often small
# Gradient norm clipping
GRADIENT_CLIP_NORM = 5.0
batch_size=1
def __init__(self):
"""Set values of computed attributes."""
# Effective batch size
# self.BATCH_SIZE = self.IMAGES_PER_GPU * self.GPU_COUNT
# Input image size
if self.IMAGE_RESIZE_MODE == "crop":
self.IMAGE_SHAPE = np.array([self.IMAGE_MIN_DIM, self.IMAGE_MIN_DIM, 3])
else:
self.IMAGE_SHAPE = np.array([self.IMAGE_MAX_DIM, self.IMAGE_MAX_DIM, 3])
# Image meta data length
# See compose_image_meta() for details
self.IMAGE_META_SIZE = 1 + 3 + 3 + 4 + 1 + self.NUM_CLASSES
def display(self):
"""Display Configuration values."""
print(" Configurations:")
for a in dir(self):
if not a.startswith("__") and not callable(getattr(self, a)):
print("{:30} {}".format(a, getattr(self, a)))
print(" ")
# 预测图片基本配置更改
class Predict_Config(Config_config):
GPU_COUNT = 1
IMAGES_PER_GPU = 1
IMAGE_MIN_DIM = 128
IMAGE_MAX_DIM = 256
batch_size = 1
config = Config_config() # 基本配置建立实列
config.display() # 显示基本配置
import skimage.color
import skimage.io
import skimage.transform
class Dataset_data(object):
"""The base class for dataset classes.
To use it, create a new class that adds functions specific to the dataset
you want to use. For example:
class CatsAndDogsDataset(Dataset):
def load_cats_and_dogs(self):
...
def load_mask(self, image_id):
...
def image_reference(self, image_id):
...
See COCODataset and ShapesDataset as examples.
"""
def __init__(self, class_map=None):
self.image_ids = []
self.image_info = []
# Background is always the first class
self.class_info = [{ "id": 0, "name": "BG"}]
# self.source_class_ids = {"":[0],"shapes": [0,1,2,3,4]}
self.class_names = [] # 包含0背景名字
def add_class(self, class_id, class_name):
# assert "." not in source, "Source name cannot contain a dot"
# Does the class exist already?
for info in self.class_info:
if info["id"] == class_id:
# source.class_id combination already available, skip
return
# Add the class
self.class_info.append({
# "source": source,
"id": class_id,
"name": class_name,
})
def add_image(self, image_id, path, **kwargs):
image_info = {
"id": image_id,
# "source": source,
"path": path,
}
image_info.update(kwargs)
self.image_info.append(image_info)
def data_load_information(self, img_floder): # count表示transon文件的数量 img_floder 是transon文件路径
"""
该函数向class_info添加不良类的代码必须手动修改,
该函数主要保存类别信息,图片信息(如原始图片路径,
高宽及mask图片路径等)。
该函数只要输入文件名字,它会自动遍历所有文件,
并保存文件图片的信息。
"""
# Add classes
self.add_class( 1, "line_bulge") # 添加标签,这里只有一个不良 ###########################################################
self.add_class( 2, "dot_concave")
self.add_class( 3, "dot_bulge")
self.add_class( 4, "Irregular_concave")
img_file_list = os.listdir(img_floder) # 返回文件夹中包含的名字目录
count = len(img_file_list) # 有多少数量
id = 0
for sorce_path in img_file_list: # 遍历所有文件夹
yaml_path = os.path.join(img_floder + ‘‘ + sorce_path, ‘info.yaml‘) # label_names: - _background_ - NG
mask_path = os.path.join(img_floder + ‘‘ + sorce_path, ‘label.png‘)
img_path = os.path.join(img_floder + ‘‘ + sorce_path, ‘img.png‘)
cv_img = cv2.imdecode(np.fromfile(mask_path, dtype=np.uint8),
cv2.IMREAD_UNCHANGED) # np.fromfile以np.uint8读取文件 # cv2.imdecode缓存中读取数据,并解码成图像格式
self.add_image( image_id=id, path=img_path, width=cv_img.shape[1], height=cv_img.shape[0],
mask_path=mask_path, yaml_path=yaml_path)
id += 1
if id > count:
break
self.num_classes = len(self.class_info)
self.class_ids = np.arange(self.num_classes)
self.class_names = [c["name"] for c in self.class_info] # 保存图片类别,包含0
self.num_images = len(self.image_info) # 保存图片数量
self.image_ids = np.arange(self.num_images) # 根据图片数量产生图片编号
def load_image(self, image_id):
"""
该函数在数据产生时候使用
Load the specified image and return a [H,W,3] Numpy array.
"""
# Load image
image = skimage.io.imread(self.image_info[image_id][‘path‘])
# If grayscale. Convert to RGB for consistency.
if image.ndim != 3:
image = skimage.color.gray2rgb(image)
# If has an alpha channel, remove it for consistency
if image.shape[-1] == 4:
image = image[..., :3]
return image
def load_mask(self, image_id):
"""
该函数也是在数据产生中使用,主要根据图片序列,产生图片的mask,
将有mask的修改成值为1,其它为0,并返回每个mask对应的类数值,
返回mask与class_ids,其中mask为[w,h,object],
class_ids为[object],如[w,h,4]与[1,3,1,2]
"""
# global iter_num
info = self.image_info[image_id] # according image_id that belong int to choose image_info information
img = Image.open(info[‘mask_path‘]) # loading mask_path from label_image that original image handled have changed mask image with label
num_obj = np.max(img) # 取一个最大值,得到验证有多少个物体就会是多少,如这张图有3个mask则该值等于3
mask = np.zeros([info[‘height‘], info[‘width‘], num_obj], dtype=np.uint8)
count=1
for index in range(num_obj):
for i in range(info[‘width‘]):
for j in range(info[‘height‘]):
# info[‘width‘] 与info[‘height‘] 为label.png图像的宽度与高度
at_pixel = img.getpixel((i, j))
if at_pixel == index + 1:
mask[j, i, index] = 1 # 将有mask位置取1
mask = mask.astype(np.uint8)
# occlusion = np.logical_not(mask[:, :, -1]).astype(np.uint8) #
# for i in range(count - 2, -1, -1):
# mask[:, :, i] = mask[:, :, i] * occlusion
# occlusion = np.logical_and(occlusion, np.logical_not(mask[:, :, i]))
#
labels = self.from_yaml_get_class(image_id)
labels_form = []
num_classes=len(self.class_info) # 包含背景BG
for i in range(len(labels)): # search image_id label to add labels_form.append
for j in range(1,num_classes):
if labels[i].find(self.class_info[j]["name"]) != -1: # find()function checking if having line_bulge,
# if so ,return start index if not ,return -1.therefore judge return value equal -1
labels_form.append(self.class_info[j]["name"])
class_ids = np.array([self.class_names.index(s) for s in labels_form])
# 按照class_ids 选定图片,然后按照yaml文件的分类匹配到class中,并给出整数代表
return mask, class_ids.astype(np.int32)
# 解析labelme中得到的yaml文件,从而得到mask每一层对应的实例标签
def from_yaml_get_class(self, image_id):
‘‘‘
temp={‘label_names‘: [‘_background_‘, ‘11111‘, ‘22222‘, ‘3333‘]}
labels=[‘_background_‘, ‘11111‘, ‘22222‘, ‘3333‘]
labels[0]=[‘11111‘, ‘22222‘, ‘3333‘]
:param image_id:
:return:
‘‘‘
info = self.image_info[image_id]
with open(info[‘yaml_path‘]) as f:
temp = yaml.load(f.read(), Loader=yaml.FullLoader)
labels = temp[‘label_names‘]
del labels[0]
return labels
def generate_pyramid_anchors(self, scales, ratios, feature_shapes, feature_strides, anchor_stride):
"""Generate anchors at different levels of a feature pyramid. Each scale
is associated with a level of the pyramid, but each ratio is used in
all levels of the pyramid.
Returns:
anchors: [N, (y1, x1, y2, x2)]. All generated anchors in one array. Sorted
with the same order of the given scales. So, anchors of scale[0] come
first, then anchors of scale[1], and so on.
"""
# Anchors
# [anchor_count, (y1, x1, y2, x2)]
anchors = []
for i in range(len(scales)):
# anchors.append(generate_anchors(scales[i], ratios, feature_shapes[i], feature_strides[i], anchor_stride))
"""
scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
shape: [height, width] spatial shape of the feature map over which to generate anchors.
feature_stride: Stride of the feature map relative to the image in pixels.
anchor_stride: Stride of anchors on the feature map. For example, if the value is 2 then generate anchors for every other feature map pixel.
"""
# Get all combinations of scales and ratios
scale, ratios = np.meshgrid(np.array(scales[i]), np.array(ratios))
scale = scale.flatten()
ratios = ratios.flatten()
shape = feature_shapes[i]
feature_stride = feature_strides[i]
# Enumerate heights and widths from scales and ratios
# 实际得到box的宽与高
heights = scale / np.sqrt(ratios)
widths = scale * np.sqrt(ratios)
# Enumerate shifts in feature space
# 实际得到box坐标中心
shifts_y = np.arange(0, shape[0],
anchor_stride) * feature_stride # anchor_stride 表示原图img/stride缩放后以anchor_stride为步长取像素,
# 一此作为中心点,而后乘以feature_stride(stride)将像素中心放回原图像位置中。
shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
# Enumerate combinations of shifts, widths, and heights
box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
# Reshape to get a list of (y, x) and a list of (h, w)
box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
# code above make center of bboxes and height width of bboxes
# Convert to corner coordinates (y1, x1, y2, x2)
boxes = np.concatenate([box_centers - 0.5 * box_sizes, box_centers + 0.5 * box_sizes], axis=1)
# convert center height and width coordinate of bbox to four coordinates which respectively represnt top left corner and lower right corner
anchors.append(boxes)
return np.concatenate(anchors, axis=0)
def resize(self, image, output_shape, order=1, mode=‘constant‘, cval=0, clip=True,
preserve_range=False, anti_aliasing=False, anti_aliasing_sigma=None):
"""A wrapper for Scikit-Image resize().
Scikit-Image generates warnings on every call to resize() if it doesn‘t
receive the right parameters. The right parameters depend on the version
of skimage. This solves the problem by using different parameters per
version. And it provides a central place to control resizing defaults.
"""
if LooseVersion(skimage.__version__) >= LooseVersion("0.14"):
# New in 0.14: anti_aliasing. Default it to False for backward
# compatibility with skimage 0.13.
return skimage.transform.resize(
image, output_shape,
order=order, mode=mode, cval=cval, clip=clip,
preserve_range=preserve_range, anti_aliasing=anti_aliasing,
anti_aliasing_sigma=anti_aliasing_sigma)
else:
return skimage.transform.resize(
image, output_shape,
order=order, mode=mode, cval=cval, clip=clip,
preserve_range=preserve_range)
def resize_image(self,image, min_dim=None, max_dim=None, min_scale=None, mode="square"):
"""Resizes an image keeping the aspect ratio unchanged.
min_dim: if provided, resizes the image such that it‘s smaller dimension == min_dim
max_dim: if provided, ensures that the image longest side doesn‘t exceed this value.
min_scale: if provided, ensure that the image is scaled up by at least
this percent even if min_dim doesn‘t require it.
mode: Resizing mode.
none: No resizing. Return the image unchanged.
square: Resize and pad with zeros to get a square image of size [max_dim, max_dim].
pad64: Pads width and height with zeros to make them multiples of 64.
If min_dim or min_scale are provided, it scales the image up
before padding. max_dim is ignored in this mode.
The multiple of 64 is needed to ensure smooth scaling of feature
maps up and down the 6 levels of the FPN pyramid (2**6=64).
crop: Picks random crops from the image. First, scales the image based
on min_dim and min_scale, then picks a random crop of
size min_dim x min_dim. Can be used in training only.
max_dim is not used in this mode.
Returns:
image: the resized image
window: (y1, x1, y2, x2). If max_dim is provided, padding might
be inserted in the returned image. If so, this window is the
coordinates of the image part of the full image (excluding
the padding). The x2, y2 pixels are not included.
scale: The scale factor used to resize the image
padding: Padding added to the image [(top, bottom), (left, right), (0, 0)]
"""
# Keep track of image dtype and return results in the same dtype
image_dtype = image.dtype
# Default window (y1, x1, y2, x2) and default scale == 1.
h, w = image.shape[:2]
window = (0, 0, h, w)
scale = 1
padding = [(0, 0), (0, 0), (0, 0)]
if mode == "none":
return image, window, scale, padding
# Scale?
if min_dim:
# Scale up but not down
scale = max(1, min_dim / min(h, w)) # h, w是原始图片的高与宽
if min_scale and scale < min_scale: # min_scale是最小填充倍数的,至少要大于它
scale = min_scale
# Does it exceed max dim?
if max_dim and mode == "square":
image_max = max(h, w)
if round(image_max * scale) > max_dim: # 最终原图片最大边扩充不能超过最大max_dim维度,否则重新选择scale
scale = max_dim / image_max
# Resize image using bilinear interpolation
if scale != 1:
image = self.resize(image, (round(h * scale), round(w * scale)), preserve_range=True)
# 上一行代码对图像做了resize,那么会改变图像的尺寸,这是我不愿意看到的,我觉的这样会对缺陷特征有损失,
# 或者出现变异,因此小心这里的变化
# Need padding or cropping?
if mode == "square":
# Get new height and width
h, w = image.shape[:2] # 此时已经将原图按照scale进行了改变
top_pad = (max_dim - h) // 2
bottom_pad = max_dim - h - top_pad
left_pad = (max_dim - w) // 2
right_pad = max_dim - w - left_pad
padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
image = np.pad(image, padding, mode=‘constant‘, constant_values=0) # 将改变的图片进行了填充
window = (top_pad, left_pad, h + top_pad, w + left_pad) # 保存经过resize后图片的真实大小
elif mode == "pad64":
h, w = image.shape[:2]
# Both sides must be divisible by 64
assert min_dim % 64 == 0, "Minimum dimension must be a multiple of 64"
# Height
if h % 64 > 0:
max_h = h - (h % 64) + 64
top_pad = (max_h - h) // 2
bottom_pad = max_h - h - top_pad
else:
top_pad = bottom_pad = 0
# Width
if w % 64 > 0:
max_w = w - (w % 64) + 64
left_pad = (max_w - w) // 2
right_pad = max_w - w - left_pad
else:
left_pad = right_pad = 0
padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
image = np.pad(image, padding, mode=‘constant‘, constant_values=0)
window = (top_pad, left_pad, h + top_pad, w + left_pad)
else:
raise Exception("Mode {} not supported".format(mode))
return image.astype(image_dtype), window, scale, padding
def resize_mask(self,mask, scale, padding):
# scale是输入图像的尺寸变化,padding是最大维度的背景填充,mask有效坐标对应原来输入的图像中
"""Resizes a mask using the given scale and padding.
Typically, you get the scale and padding from resize_image() to
ensure both, the image and the mask, are resized consistently.
scale: mask scaling factor
padding: Padding to add to the mask in the form
[(top, bottom), (left, right), (0, 0)]
"""
# Suppress warning from scipy 0.13.0, the output shape of zoom() is
# calculated with round() instead of int()
# with warnings.catch_warnings():
# warnings.simplefilter("ignore")
mask = scipy.ndimage.zoom(mask, zoom=[scale, scale, 1], order=0)
# if crop is not None:
# y, x, h, w = crop
# mask = mask[y:y + h, x:x + w]
# else:
mask = np.pad(mask, padding, mode=‘constant‘, constant_values=0)
return mask
def extract_bboxes(self,mask): # [[num_instances, (y1, x1, y2, x2)]]
# in a word,bbox proced by mask will contain all mask which value equal 1.
"""Compute bounding boxes from masks.
mask: [height, width, num_instances]. Mask pixels are either 1 or 0.
Returns: bbox array [num_instances, (y1, x1, y2, x2)].
"""
boxes = np.zeros([mask.shape[-1], 4], dtype=np.int32)
# the last dimension for mask (num_instances) is bbox for instance every picture
for i in range(mask.shape[-1]):
m = mask[:, :, i]
# Bounding box.
horizontal_indicies = np.where(np.any(m, axis=0))[0]
vertical_indicies = np.where(np.any(m, axis=1))[0]
if horizontal_indicies.shape[0]:
x1, x2 = horizontal_indicies[[0, -1]]
y1, y2 = vertical_indicies[[0, -1]]
# x2 and y2 should not be part of the box. Increment by 1.
x2 += 1
y2 += 1
else:
# No mask for this instance. Might happen due to
# resizing or cropping. Set bbox to zeros
x1, x2, y1, y2 = 0, 0, 0, 0
boxes[i] = np.array([y1, x1, y2, x2])
return boxes.astype(np.int32)
def load_image_gt(self, config, image_id, augment=False, augmentation=None):
# Load image and mask
print("image_id : ", image_id) # 打印载入图片的序号
image = self.load_image(image_id)
mask, class_ids = self.load_mask(image_id)
original_shape = image.shape
image, window, scale, padding = self.resize_image(
image,
min_dim=config.IMAGE_MIN_DIM,
min_scale=config.IMAGE_MIN_SCALE,
max_dim=config.IMAGE_MAX_DIM,
mode=config.IMAGE_RESIZE_MODE)
mask = self.resize_mask(mask, scale, padding)
print(‘data_resize_image and resize_mask‘)
# Random horizontal flips.
# TODO: will be removed in a future update in favor of augmentation
if random.randint(0, 1):
image = np.fliplr(image)
mask = np.fliplr(mask)
# Augmentation
# This requires the imgaug lib (https://github.com/aleju/imgaug)
if augmentation:
import imgaug
# Augmenters that are safe to apply to masks
# Some, such as Affine, have settings that make them unsafe, so always
# test your augmentation on masks
MASK_AUGMENTERS = ["Sequential", "SomeOf", "OneOf", "Sometimes",
"Fliplr", "Flipud", "CropAndPad",
"Affine", "PiecewiseAffine"]
def hook(images, augmenter, parents, default):
"""Determines which augmenters to apply to masks."""
return augmenter.__class__.__name__ in MASK_AUGMENTERS
# Store shapes before augmentation to compare
image_shape = image.shape
mask_shape = mask.shape
# Make augmenters deterministic to apply similarly to images and masks
det = augmentation.to_deterministic()
image = det.augment_image(image)
# Change mask to np.uint8 because imgaug doesn‘t support np.bool
mask = det.augment_image(mask.astype(np.uint8), hooks=imgaug.HooksImages(activator=hook))
# Verify that shapes didn‘t change
assert image.shape == image_shape, "Augmentation shouldn‘t change image size"
assert mask.shape == mask_shape, "Augmentation shouldn‘t change mask size"
# Change mask back to bool
mask = mask.astype(np.bool)
# Note that some boxes might be all zeros if the corresponding mask got cropped out.
# and here is to filter them out
_idx = np.sum(mask, axis=(0, 1)) > 0
mask = mask[:, :, _idx]
class_ids = class_ids[_idx]
# Bounding boxes. Note that some boxes might be all zeros
# if the corresponding mask got cropped out.
# bbox: [num_instances, (y1, x1, y2, x2)]
bbox = self.extract_bboxes(mask)
# Active classes
# Different datasets have different classes, so track the
# classes supported in the dataset of this image.
active_class_ids = np.ones([self.num_classes], dtype=np.int32)
image_meta = np.array(
[image_id] + # size=1
list(original_shape) + # size=3
list(image.shape) + # size=3
list(window) + # size=4 (y1, x1, y2, x2) in image cooredinates
[scale] + # size=1
list(active_class_ids) # size=num_classes
)
print(‘using model data‘)
return image, image_meta, class_ids, bbox, mask
def compute_overlaps(self,boxes1, boxes2):
# each value in boxes2 compute with all boxes1,and calling compute_iou function
# finally, value save in [number_boxes1,number_boxes2]
"""Computes IoU overlaps between two sets of boxes.
boxes1, boxes2: [N, (y1, x1, y2, x2)].
For better performance, pass the largest set first and the smaller second.
"""
# Areas of anchors and GT boxes
area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
# Compute overlaps to generate matrix [boxes1 count, boxes2 count]
# Each cell contains the IoU value.
overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0])) # building variables for overlaps to save
for i in range(overlaps.shape[1]):
box2 = boxes2[i]
y1 = np.maximum(box2[0], boxes1[:, 0])
y2 = np.minimum(box2[2], boxes1[:, 2])
x1 = np.maximum(box2[1], boxes1[:, 1])
x2 = np.minimum(box2[3], boxes1[:, 3])
intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
union = area2[i] + area1[:] - intersection[:]
overlaps[:, i] = intersection / union
return overlaps
def build_rpn_targets(self, anchors, gt_class_ids, gt_boxes, config):
print(‘data_rpn_box‘)
"""Given the anchors and GT boxes, compute overlaps and identify positive
anchors and deltas to refine them to match their corresponding GT boxes.
anchors: [num_anchors, (y1, x1, y2, x2)]
gt_class_ids: [num_gt_boxes] Integer class IDs.
gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
Returns:
rpn_match: [N] (int32) matches between anchors and GT boxes.
1 = positive anchor, -1 = negative anchor, 0 = neutral
rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
"""
# RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
# RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4))
# Handle COCO crowds
# A crowd box in COCO is a bounding box around several instances. Exclude
# them from training. A crowd box is given a negative class ID.
crowd_ix = np.where(gt_class_ids < 0)[0]
if crowd_ix.shape[0] > 0:
# Filter out crowds from ground truth class IDs and boxes
non_crowd_ix = np.where(gt_class_ids > 0)[0]
crowd_boxes = gt_boxes[crowd_ix]
gt_class_ids = gt_class_ids[non_crowd_ix]
gt_boxes = gt_boxes[non_crowd_ix]
# Compute overlaps with crowd boxes [anchors, crowds]
crowd_overlaps = self.compute_overlaps(anchors, crowd_boxes)
crowd_iou_max = np.amax(crowd_overlaps, axis=1)
no_crowd_bool = (crowd_iou_max < 0.001)
else:
# All anchors don‘t intersect a crowd
no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)
# Compute overlaps [num_anchors, num_gt_boxes]
overlaps = self.compute_overlaps(anchors, gt_boxes)
# Match anchors to GT Boxes
# If an anchor overlaps a GT box with IoU >= 0.7 then it‘s positive.
# If an anchor overlaps a GT box with IoU < 0.3 then it‘s negative.
# Neutral anchors are those that don‘t match the conditions above,
# and they don‘t influence the loss function.
# However, don‘t keep any GT box unmatched (rare, but happens). Instead,
# match it to the closest anchor (even if its max IoU is < 0.3).
#
# 1. Set negative anchors first. They get overwritten below if a GT box is
# matched to them. Skip boxes in crowd areas.
anchor_iou_argmax = np.argmax(overlaps, axis=1)
anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
# 2. Set an anchor for each GT box (regardless of IoU value).
# If multiple anchors have the same IoU match all of them
gt_iou_argmax = np.argwhere(overlaps == np.max(overlaps, axis=0))[:, 0]
rpn_match[gt_iou_argmax] = 1
# 3. Set anchors with high overlap as positive.
rpn_match[anchor_iou_max >= 0.7] = 1
# Subsample to balance positive and negative anchors
# Don‘t let positives be more than half the anchors
ids = np.where(rpn_match == 1)[0]
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2)
if extra > 0:
# Reset the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0
# Same for negative proposals
ids = np.where(rpn_match == -1)[0]
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE -
np.sum(rpn_match == 1))
if extra > 0:
# Rest the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0
# For positive anchors, compute shift and scale needed to transform them
# to match the corresponding GT boxes.
ids = np.where(rpn_match == 1)[0]
ix = 0 # index into rpn_bbox
# TODO: use box_refinement() rather than duplicating the code here
for i, a in zip(ids, anchors[ids]):
# Closest gt box (it might have IoU < 0.7)
gt = gt_boxes[anchor_iou_argmax[i]]
# Convert coordinates to center plus width/height.
# GT Box
gt_h = gt[2] - gt[0]
gt_w = gt[3] - gt[1]
gt_center_y = gt[0] + 0.5 * gt_h
gt_center_x = gt[1] + 0.5 * gt_w
# Anchor
a_h = a[2] - a[0]
a_w = a[3] - a[1]
a_center_y = a[0] + 0.5 * a_h
a_center_x = a[1] + 0.5 * a_w
# Compute the bbox refinement that the RPN should predict.
rpn_bbox[ix] = [
(gt_center_y - a_center_y) / a_h,
(gt_center_x - a_center_x) / a_w,
np.log(gt_h / a_h),
np.log(gt_w / a_w),
]
# Normalize
rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV
ix += 1
return rpn_match, rpn_bbox
def generate_random_rois(self, image_shape, count, gt_boxes):
"""Generates ROI proposals similar to what a region proposal network
would generate.
image_shape: [Height, Width, Depth]
count: Number of ROIs to generate
gt_class_ids: [N] Integer ground truth class IDs
gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.
Returns: [count, (y1, x1, y2, x2)] ROI boxes in pixels.
"""
# placeholder
rois = np.zeros((count, 4), dtype=np.int32)
# Generate random ROIs around GT boxes (90% of count)
rois_per_box = int(0.9 * count / gt_boxes.shape[0])
for i in range(gt_boxes.shape[0]):
gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
h = gt_y2 - gt_y1
w = gt_x2 - gt_x1
# random boundaries
r_y1 = max(gt_y1 - h, 0)
r_y2 = min(gt_y2 + h, image_shape[0])
r_x1 = max(gt_x1 - w, 0)
r_x2 = min(gt_x2 + w, image_shape[1])
# To avoid generating boxes with zero area, we generate double what
# we need and filter out the extra. If we get fewer valid boxes
# than we need, we loop and try again.
while True:
y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))
# Filter out zero area boxes
threshold = 1
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
threshold][:rois_per_box]
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
threshold][:rois_per_box]
if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box:
break
# Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
# into x1, y1, x2, y2 order
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
box_rois = np.hstack([y1, x1, y2, x2])
rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois
# Generate random ROIs anywhere in the image (10% of count)
remaining_count = count - (rois_per_box * gt_boxes.shape[0])
# To avoid generating boxes with zero area, we generate double what
# we need and filter out the extra. If we get fewer valid boxes
# than we need, we loop and try again.
while True:
y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2))
x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2))
# Filter out zero area boxes
threshold = 1
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
threshold][:remaining_count]
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
threshold][:remaining_count]
if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count:
break
# Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
# into x1, y1, x2, y2 order
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
global_rois = np.hstack([y1, x1, y2, x2])
rois[-remaining_count:] = global_rois
return rois
def box_refinement(self,box, gt_box):
"""Compute refinement needed to transform box to gt_box.
box and gt_box are [N, (y1, x1, y2, x2)]. (y2, x2) is
assumed to be outside the box.
"""
box = box.astype(np.float32)
gt_box = gt_box.astype(np.float32)
height = box[:, 2] - box[:, 0]
width = box[:, 3] - box[:, 1]
center_y = box[:, 0] + 0.5 * height
center_x = box[:, 1] + 0.5 * width
gt_height = gt_box[:, 2] - gt_box[:, 0]
gt_width = gt_box[:, 3] - gt_box[:, 1]
gt_center_y = gt_box[:, 0] + 0.5 * gt_height
gt_center_x = gt_box[:, 1] + 0.5 * gt_width
dy = (gt_center_y - center_y) / height
dx = (gt_center_x - center_x) / width
dh = np.log(gt_height / height)
dw = np.log(gt_width / width)
return np.stack([dy, dx, dh, dw], axis=1)
def build_detection_targets(self, rpn_rois, gt_class_ids, gt_boxes, gt_masks, config):
"""Generate targets for training Stage 2 classifier and mask heads.
This is not used in normal training. It‘s useful for debugging or to train
the Mask RCNN heads without using the RPN head.
Inputs:
rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes.
gt_class_ids: [instance count] Integer class IDs
gt_boxes: [instance count, (y1, x1, y2, x2)]
gt_masks: [height, width, instance count] Ground truth masks. Can be full
size or mini-masks.
Returns:
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific
bbox refinements.
masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES). Class specific masks cropped
to bbox boundaries and resized to neural network output size.
"""
assert rpn_rois.shape[0] > 0
assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(
gt_class_ids.dtype)
assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(
gt_boxes.dtype)
assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format(
gt_masks.dtype)
# It‘s common to add GT Boxes to ROIs but we don‘t do that here because
# according to XinLei Chen‘s paper, it doesn‘t help.
# Trim empty padding in gt_boxes and gt_masks parts
instance_ids = np.where(gt_class_ids > 0)[0]
assert instance_ids.shape[0] > 0, "Image must contain instances."
gt_class_ids = gt_class_ids[instance_ids]
gt_boxes = gt_boxes[instance_ids]
gt_masks = gt_masks[:, :, instance_ids]
# Compute areas of ROIs and ground truth boxes.
rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) *
(rpn_rois[:, 3] - rpn_rois[:, 1])
gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) *
(gt_boxes[:, 3] - gt_boxes[:, 1])
# Compute overlaps [rpn_rois, gt_boxes]
overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
for i in range(overlaps.shape[1]):
gt = gt_boxes[i]
overlaps[:, i] = self.compute_iou(
gt, rpn_rois, gt_box_area[i], rpn_roi_area)
# Assign ROIs to GT boxes
rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
rpn_roi_iou_max = overlaps[np.arange(
overlaps.shape[0]), rpn_roi_iou_argmax]
# GT box assigned to each ROI
rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]
# Positive ROIs are those with >= 0.5 IoU with a GT box.
fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]
# Negative ROIs are those with max IoU 0.1-0.5 (hard example mining)
# TODO: To hard example mine or not to hard example mine, that‘s the question
# bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
# Subsample ROIs. Aim for 33% foreground.
# FG
fg_roi_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO)
if fg_ids.shape[0] > fg_roi_count:
keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
else:
keep_fg_ids = fg_ids
# BG
remaining = config.TRAIN_ROIS_PER_IMAGE - keep_fg_ids.shape[0]
if bg_ids.shape[0] > remaining:
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
else:
keep_bg_ids = bg_ids
# Combine indices of ROIs to keep
keep = np.concatenate([keep_fg_ids, keep_bg_ids])
# Need more?
remaining = config.TRAIN_ROIS_PER_IMAGE - keep.shape[0]
if remaining > 0:
# Looks like we don‘t have enough samples to maintain the desired
# balance. Reduce requirements and fill in the rest. This is
# likely different from the Mask RCNN paper.
# There is a small chance we have neither fg nor bg samples.
if keep.shape[0] == 0:
# Pick bg regions with easier IoU threshold
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
assert bg_ids.shape[0] >= remaining
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
assert keep_bg_ids.shape[0] == remaining
keep = np.concatenate([keep, keep_bg_ids])
else:
# Fill the rest with repeated bg rois.
keep_extra_ids = np.random.choice(
keep_bg_ids, remaining, replace=True)
keep = np.concatenate([keep, keep_extra_ids])
assert keep.shape[0] == config.TRAIN_ROIS_PER_IMAGE,
"keep doesn‘t match ROI batch size {}, {}".format(
keep.shape[0], config.TRAIN_ROIS_PER_IMAGE)
# Reset the gt boxes assigned to BG ROIs.
rpn_roi_gt_boxes[keep_bg_ids, :] = 0
rpn_roi_gt_class_ids[keep_bg_ids] = 0
# For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
rois = rpn_rois[keep]
roi_gt_boxes = rpn_roi_gt_boxes[keep]
roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
roi_gt_assignment = rpn_roi_iou_argmax[keep]
# Class-aware bbox deltas. [y, x, log(h), log(w)]
bboxes = np.zeros((config.TRAIN_ROIS_PER_IMAGE,
config.NUM_CLASSES, 4), dtype=np.float32)
pos_ids = np.where(roi_gt_class_ids > 0)[0]
bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = self.box_refinement(
rois[pos_ids], roi_gt_boxes[pos_ids, :4])
# Normalize bbox refinements
bboxes /= config.BBOX_STD_DEV
# Generate class-specific target masks
masks = np.zeros((config.TRAIN_ROIS_PER_IMAGE, config.MASK_SHAPE[0], config.MASK_SHAPE[1], config.NUM_CLASSES),
dtype=np.float32)
for i in pos_ids:
class_id = roi_gt_class_ids[i]
assert class_id > 0, "class id must be greater than 0"
gt_id = roi_gt_assignment[i]
class_mask = gt_masks[:, :, gt_id]
# if config.USE_MINI_MASK:
# # Create a mask placeholder, the size of the image
# placeholder = np.zeros(config.IMAGE_SHAPE[:2], dtype=bool)
# # GT box
# gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[gt_id]
# gt_w = gt_x2 - gt_x1
# gt_h = gt_y2 - gt_y1
# # Resize mini mask to size of GT box
# placeholder[gt_y1:gt_y2, gt_x1:gt_x2] =
# np.round(utils.resize(class_mask, (gt_h, gt_w))).astype(bool)
# # Place the mini batch in the placeholder
# class_mask = placeholder
# Pick part of the mask and resize it
y1, x1, y2, x2 = rois[i].astype(np.int32)
m = class_mask[y1:y2, x1:x2]
mask = self.resize(m, config.MASK_SHAPE)
masks[i, :, :, class_id] = mask
return rois, roi_gt_class_ids, bboxes, masks
def data_generator(self, config, shuffle=True, augment=False, augmentation=None,
random_rois=0, batch_size=1, detection_targets=False):
b = 0 # batch item index
image_index = -1
image_ids = np.copy(self.image_ids) # dataset.image_ids 运用了 @property
error_count = 0
# Anchors
# [anchor_count, (y1, x1, y2, x2)]
backbone_shapes =
np.array([[int(math.ceil(config.IMAGE_SHAPE[0] / stride)),
int(math.ceil(config.IMAGE_SHAPE[1] / stride))] for stride in
config.BACKBONE_STRIDES]) # BACKBONE_STRIDES = [4, 8, 16, 32, 64]
# compute_backbone_shapes(config, config.IMAGE_SHAPE) # (5,2) # [4, 8, 16, 32, 64]
anchors = self.generate_pyramid_anchors(config.RPN_ANCHOR_SCALES, # (8, 16, 32, 64, 128)
config.RPN_ANCHOR_RATIOS, # [0.5, 1, 2]
backbone_shapes, # image_shape / [4, 8, 16, 32, 64] is five rows 2 cols
config.BACKBONE_STRIDES, # [4, 8, 16, 32, 64]
config.RPN_ANCHOR_STRIDE) # =1
print(‘data_class_data_anchors‘)
# 【n,4】
# 得到的anchor数量为 每个scale分别是3*(image_shape/4)**2,3*(image_shape/8)**2,3*(image_shape/16)**2,
# 3*(image_shape/4)**2,3*(image_shape/64)**2,
# Keras requires a generator to run indefinitely.
while True:
try:
# Increment index to pick next image. Shuffle if at the start of an epoch.
image_index = (image_index + 1) % len(image_ids)
if shuffle and image_index == 0:
np.random.shuffle(image_ids)
# Get GT bounding boxes and masks for image.
image_id = image_ids[image_index]
image, image_meta, gt_class_ids, gt_boxes, gt_masks =
self.load_image_gt(config, image_id, augment=augment,
augmentation=augmentation)
# Skip images that have no instances. This can happen in cases
# where we train on a subset of classes and the image doesn‘t
# have any of the classes we care about.
if not np.any(gt_class_ids > 0):
continue
# RPN Targets
rpn_match, rpn_bbox = self.build_rpn_targets(anchors, gt_class_ids, gt_boxes, config)
# Mask R-CNN Targets
if random_rois:
rpn_rois = self.generate_random_rois(image.shape, random_rois, gt_boxes)
if detection_targets:
rois, mrcnn_class_ids, mrcnn_bbox, mrcnn_mask =
self.build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks, config)
# Init batch arrays
if b == 0:
batch_image_meta = np.zeros((batch_size,) + image_meta.shape, dtype=image_meta.dtype)
batch_rpn_match = np.zeros([batch_size, anchors.shape[0], 1], dtype=rpn_match.dtype)
batch_rpn_bbox = np.zeros([batch_size, config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4], dtype=rpn_bbox.dtype)
batch_images = np.zeros((batch_size,) + image.shape, dtype=np.float32)
batch_gt_class_ids = np.zeros((batch_size, config.MAX_GT_INSTANCES), dtype=np.int32)
batch_gt_boxes = np.zeros((batch_size, config.MAX_GT_INSTANCES, 4), dtype=np.int32)
batch_gt_masks = np.zeros(
(batch_size, gt_masks.shape[0], gt_masks.shape[1], config.MAX_GT_INSTANCES),
dtype=gt_masks.dtype)
if random_rois:
batch_rpn_rois = np.zeros((batch_size, rpn_rois.shape[0], 4), dtype=rpn_rois.dtype)
if detection_targets:
batch_rois = np.zeros((batch_size,) + rois.shape, dtype=rois.dtype)
batch_mrcnn_class_ids = np.zeros((batch_size,) + mrcnn_class_ids.shape,
dtype=mrcnn_class_ids.dtype)
batch_mrcnn_bbox = np.zeros((batch_size,) + mrcnn_bbox.shape, dtype=mrcnn_bbox.dtype)
batch_mrcnn_mask = np.zeros((batch_size,) + mrcnn_mask.shape, dtype=mrcnn_mask.dtype)
# If more instances than fits in the array, sub-sample from them.
if gt_boxes.shape[0] > config.MAX_GT_INSTANCES:
ids = np.random.choice(np.arange(gt_boxes.shape[0]), config.MAX_GT_INSTANCES, replace=False)
gt_class_ids = gt_class_ids[ids]
gt_boxes = gt_boxes[ids]
gt_masks = gt_masks[:, :, ids]
# Add to batch
batch_image_meta[b] = image_meta
batch_rpn_match[b] = rpn_match[:, np.newaxis]
batch_rpn_bbox[b] = rpn_bbox
batch_images[b] = image.astype(np.float32) - config.MEAN_PIXEL
batch_gt_class_ids[b, :gt_class_ids.shape[0]] = gt_class_ids
batch_gt_boxes[b, :gt_boxes.shape[0]] = gt_boxes
batch_gt_masks[b, :, :, :gt_masks.shape[-1]] = gt_masks
if random_rois:
batch_rpn_rois[b] = rpn_rois
if detection_targets:
batch_rois[b] = rois
batch_mrcnn_class_ids[b] = mrcnn_class_ids
batch_mrcnn_bbox[b] = mrcnn_bbox
batch_mrcnn_mask[b] = mrcnn_mask
b += 1
# Batch full?
if b >= batch_size:
inputs = [batch_images, batch_image_meta, batch_rpn_match, batch_rpn_bbox,
batch_gt_class_ids, batch_gt_boxes, batch_gt_masks]
outputs = []
if random_rois:
inputs.extend([batch_rpn_rois])
if detection_targets:
inputs.extend([batch_rois])
# Keras requires that output and targets have the same number of dimensions
batch_mrcnn_class_ids = np.expand_dims(
batch_mrcnn_class_ids, -1)
outputs.extend(
[batch_mrcnn_class_ids, batch_mrcnn_bbox, batch_mrcnn_mask])
print(‘data_load_finish‘)
yield inputs , outputs
# start a new batch
b = 0
except:
raise Exception("not pass")
‘‘‘
可能会抛出异常,属于正常,因为出现生成器销毁而出现的。
Exception ignored in: <generator object Dataset_data.data_generator at 0x000002002D40BB48>
Traceback (most recent call last):
File "C:/Users/51102/Desktop/MASKRCNN_tangjun/Mask_RCNN-master/train_demo.py", line 1249, in data_generator
raise Exception("not pass")
Exception: not pass
‘‘‘
def train_model():
img_floder =‘C:Users51102Desktopmaskrcnn(tangjun)1021‘ ####################################################################################################
dataset_train = Dataset_data()
dataset_train.data_load_information(img_floder)
model = modellib.MaskRCNN(mode="training", config=config)
COCO_MODEL_PATH=‘C:Users51102Desktopmaskrcnn(tangjun)mask_rcnn_shapes_0002.h5‘
model.load_weights(COCO_MODEL_PATH, by_name=True,
exclude=["mrcnn_class_logits", "mrcnn_bbox_fc",
"mrcnn_bbox", "mrcnn_mask"])
# 产生数据
train_generator = dataset_train.data_generator(config, shuffle=True,
augmentation=None,
batch_size=config.batch_size)
model.train(train_generator,
learning_rate=config.LEARNING_RATE,
epochs=4,
layers=‘heads‘)
# Fine tune all layers
# Passing layers="all" trains all layers. You can also
# pass a regular expression to select which layers to
# train by name pattern.
# model.train(dataset_train, dataset_train,
# learning_rate=config.LEARNING_RATE / 10,
# epochs=3,
# layers="all")
from skimage.measure import find_contours
import matplotlib.pyplot as plt
from matplotlib import patches
from matplotlib.patches import Polygon
import colorsys
def random_colors(N, bright=True):
"""
Generate random colors.
To get visually distinct colors, generate them in HSV space then
convert to RGB.
"""
brightness = 1.0 if bright else 0.7
hsv = [(i / N, 1, brightness) for i in range(N)]
colors = list(map(lambda c: colorsys.hsv_to_rgb(*c), hsv))
random.shuffle(colors)
return colors
def apply_mask(image, mask, color, alpha=0.5):
"""Apply the given mask to the image.
"""
for c in range(3):
image[:, :, c] = np.where(mask == 1,
image[:, :, c] *
(1 - alpha) + alpha * color[c] * 255,
image[:, :, c])
return image
def display_instances(image, boxes, masks, class_ids, class_names,
scores=None, title="",
figsize=(16, 16), ax=None,
show_mask=True, show_bbox=True,
colors=None, captions=None):
"""
boxes: [num_instance, (y1, x1, y2, x2, class_id)] in image coordinates.
masks: [height, width, num_instances]
class_ids: [num_instances]
class_names: list of class names of the dataset
scores: (optional) confidence scores for each box
title: (optional) Figure title
show_mask, show_bbox: To show masks and bounding boxes or not
figsize: (optional) the size of the image
colors: (optional) An array or colors to use with each object
captions: (optional) A list of strings to use as captions for each object
"""
# Number of instances
N = boxes.shape[0]
if not N:
print(" *** No instances to display *** ")
else:
assert boxes.shape[0] == masks.shape[-1] == class_ids.shape[0]
# If no axis is passed, create one and automatically call show()
auto_show = False
if not ax:
_, ax = plt.subplots(1, figsize=figsize)
auto_show = True
# Generate random colors
colors = colors or random_colors(N)
# Show area outside image boundaries.
height, width = image.shape[:2]
ax.set_ylim(height + 10, -10)
ax.set_xlim(-10, width + 10)
ax.axis(‘off‘)
ax.set_title(title)
masked_image = image.astype(np.uint32).copy()
for i in range(N):
color = colors[i]
# Bounding box
if not np.any(boxes[i]):
# Skip this instance. Has no bbox. Likely lost in image cropping.
continue
y1, x1, y2, x2 = boxes[i]
# cv.rectangle(masked_image, (y1[0],x1[0]), (y2[0],x2[0]), (0, 250, 0), 2) # 自己添加代码
if show_bbox:
p = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2,
alpha=0.7, linestyle="dashed",
edgecolor=color, facecolor=‘none‘)
ax.add_patch(p)
# Label
if not captions:
class_id = class_ids[i]
score = scores[i] if scores is not None else None
label = class_names[class_id]
caption = "{} {:.3f}".format(label, score) if score else label
else:
caption = captions[i]
ax.text(x1, y1 + 8, caption,
color=‘w‘, size=11, backgroundcolor="none")
# Mask
mask = masks[:, :, i]
if show_mask:
masked_image = apply_mask(masked_image, mask, color)
# Mask Polygon
# Pad to ensure proper polygons for masks that touch image edges.
padded_mask = np.zeros(
(mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8)
padded_mask[1:-1, 1:-1] = mask
contours = find_contours(padded_mask, 0.5)
for verts in contours:
# Subtract the padding and flip (y, x) to (x, y)
verts = np.fliplr(verts) - 1
p = Polygon(verts, facecolor="none", edgecolor=color)
ax.add_patch(p)
ax.imshow(masked_image.astype(np.uint8))
if auto_show:
plt.show()
return masked_image
def predict():
import skimage.io
# Create models in training mode
config = Predict_Config()
config.display()
model = modellib.MaskRCNN(mode="inference", config=config)
# model_path = ‘C:Users51102Desktopmask-rcnn-meMASKRCNN_myselfMask_RCNN-masterlogsshapes20200216T1602mask_rcnn_shapes_0002.h5‘
model_path = ‘C:Users51102Desktopmaskrcnn(tangjun)log 4.h5‘
# Load trained weights (fill in path to trained weights here)
assert model_path != "", "Provide path to trained weights"
print("Loading weights from ", model_path)
model.load_weights(model_path, by_name=True)
class_names = [‘BG‘, ‘line_bulge‘,‘dot_concave‘,‘dot_bulge‘,‘Irregular_concave‘]
# file_names =‘C:Users51102Desktopmaskrcnn(tangjun)1.jpg‘
file_names=‘C:Users51102Desktopmaskrcnn(tangjun)3.bmp‘
# image = skimage.io.imread(os.path.join(IMAGE_DIR, random.choice(file_names)))
image = skimage.io.imread(file_names)
image=image[:, :, 0:3]
print(‘image=‘, image.shape)
# Run detection
results = model.detect([image], log_print=1)
‘‘‘
results.append({
"rois": final_rois,
"class_ids": final_class_ids,
"scores": final_scores,
"masks": final_masks})
‘‘‘
# Visualize results
r = results[0]
print(‘r=‘,r)
display_instances(image, r[‘rois‘], r[‘masks‘], r[‘class_ids‘], class_names, r[‘scores‘])
if __name__ == "__main__":
train_model()
# predict()
此代码为模型文件.py
"""
MASKRCNN algrithm for object detection and instance segmentation
Written and modified by tang jun on JAN , 2019
if you have questions , please connect me by Email: tangjunjunfighter@163.com
"""
import skimage.color
import skimage.io
import skimage.transform
# import urllib.request
# import shutil
# import warnings
# from distutils.version import LooseVersion
import scipy
# import os
# import random
# import datetime
import re
import math
# import logging
# from collections import OrderedDict
# import multiprocessing
import numpy as np
import tensorflow as tf
import keras
import keras.backend as K # keras中的后端backend及其相关函数
import keras.layers as KL
import keras.engine as KE
import keras.models as KM
# from mrcnn1 import utils
# Requires TensorFlow 1.3+ and Keras 2.0.8+.
from distutils.version import LooseVersion
assert LooseVersion(tf.__version__) >= LooseVersion("1.3")
assert LooseVersion(keras.__version__) >= LooseVersion(‘2.0.8‘)
############################################################
# Utility Functions
############################################################
def batch_slice(inputs, graph_fn, batch_size, names=None):
"""Splits inputs into slices and feeds each slice to a copy of the given
computation graph and then combines the results. It allows you to run a
graph on a batch of inputs even if the graph is written to support one
instance only.
inputs: list of tensors. All must have the same first dimension length
graph_fn: A function that returns a TF tensor that‘s part of a graph.
batch_size: number of slices to divide the data into.
names: If provided, assigns names to the resulting tensors.
"""
if not isinstance(inputs, list):
inputs = [inputs]
outputs = []
for i in range(batch_size):
inputs_slice = [x[i] for x in inputs] # [scores[i], xi[i]]的值
output_slice = graph_fn(*inputs_slice)
if not isinstance(output_slice, (tuple, list)):
output_slice = [output_slice] # 将其转换为列表
outputs.append(output_slice)
# Change outputs from a list of slices where each is
# a list of outputs to a list of outputs and each has
# a list of slices
outputs = list(zip(*outputs))
if names is None:
names = [None] * len(outputs) # 将其变成对应的输出类
result = [tf.stack(o, axis=0, name=n) for o, n in zip(outputs, names)]
if len(result) == 1:
result = result[0]
return result
def norm_boxes(boxes, shape):
"""Converts boxes from pixel coordinates to normalized coordinates.
boxes: [N, (y1, x1, y2, x2)] in pixel coordinates
shape: [..., (height, width)] in pixels
Note: In pixel coordinates (y2, x2) is outside the box. But in normalized
coordinates it‘s inside the box.
Returns:
[N, (y1, x1, y2, x2)] in normalized coordinates
"""
h, w = shape
scale = np.array([h - 1, w - 1, h - 1, w - 1])
shift = np.array([0, 0, 1, 1])
return np.divide((boxes - shift), scale).astype(np.float32)
def denorm_boxes(boxes, shape):
h, w = shape
scale = np.array([h - 1, w - 1, h - 1, w - 1])
shift = np.array([0, 0, 1, 1])
return np.around(np.multiply(boxes, scale) + shift).astype(np.int32)
def compute_iou(box, boxes, box_area, boxes_area):
# one box compare multiple boxes , we will get number depending on boxes,and return list.
"""Calculates IoU of the given box with the array of the given boxes.
box: 1D vector [y1, x1, y2, x2]
boxes: [boxes_count, (y1, x1, y2, x2)]
box_area: float. the area of ‘box‘
boxes_area: array of length boxes_count.
Note: the areas are passed in rather than calculated here for
efficiency. Calculate once in the caller to avoid duplicate work.
"""
# Calculate intersection areas
y1 = np.maximum(box[0], boxes[:, 0])
y2 = np.minimum(box[2], boxes[:, 2])
x1 = np.maximum(box[1], boxes[:, 1])
x2 = np.minimum(box[3], boxes[:, 3])
intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
union = box_area + boxes_area[:] - intersection[:]
iou = intersection / union
return iou
def log(text, array=None):
"""Prints a text message. And, optionally, if a Numpy array is provided it
prints it‘s shape, min, and max values.
"""
if array is not None:
text = text.ljust(25)
text += ("shape: {:20} ".format(str(array.shape)))
if array.size:
text += ("min: {:10.5f} max: {:10.5f}".format(array.min(),array.max()))
else:
text += ("min: {:10} max: {:10}".format("",""))
text += " {}".format(array.dtype)
print(text)
class BatchNorm(KL.BatchNormalization):
"""Extends the Keras BatchNormalization class to allow a central place
to make changes if needed.
Batch normalization has a negative effect on training if batches are small
so this layer is often frozen (via setting in Config class) and functions
as linear layer.
"""
def call(self, inputs, training=None):
"""
Note about training values:
None: Train BN layers. This is the normal mode
False: Freeze BN layers. Good when batch size is small
True: (don‘t use). Set layer in training mode even when making inferences
"""
return super(self.__class__, self).call(inputs, training=training)
def compute_backbone_shapes(config, image_shape):
"""Computes the width and height of each stage of the backbone network.
Returns:
[N, (height, width)]. Where N is the number of stages
"""
if callable(config.BACKBONE): # 检测对象是否可被调用 # BACKBONE = "resnet101"
return config.COMPUTE_BACKBONE_SHAPE(image_shape)
# Currently supports ResNet only
assert config.BACKBONE in ["resnet50", "resnet101"]
return np.array([[int(math.ceil(image_shape[0] / stride)), int(math.ceil(image_shape[1] / stride))] for stride in config.BACKBONE_STRIDES]) # BACKBONE_STRIDES = [4, 8, 16, 32, 64]
# [4, 8, 16, 32, 64] # 会出现 [5,2]
############################################################
# Resnet Graph
############################################################
# Code adopted from:
def identity_block(input_tensor, kernel_size, filters, stage, block, use_bias=True, train_bn=True):
"""The identity_block is the block that has no conv layer at shortcut
# Arguments
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the nb_filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: ‘a‘,‘b‘..., current block label, used for generating layer names
use_bias: Boolean. To use or not use a bias in conv layers.
train_bn: Boolean. Train or freeze Batch Norm layers
"""
nb_filter1, nb_filter2, nb_filter3 = filters
conv_name_base = ‘res‘ + str(stage) + block + ‘_branch‘
bn_name_base = ‘bn‘ + str(stage) + block + ‘_branch‘
x = KL.Conv2D(nb_filter1, (1, 1), name=conv_name_base + ‘2a‘, use_bias=use_bias)(input_tensor)
x = BatchNorm(name=bn_name_base + ‘2a‘)(x, training=train_bn)
x = KL.Activation(‘relu‘)(x)
x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding=‘same‘,name=conv_name_base + ‘2b‘, use_bias=use_bias)(x)
x = BatchNorm(name=bn_name_base + ‘2b‘)(x, training=train_bn)
x = KL.Activation(‘relu‘)(x)
x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + ‘2c‘, use_bias=use_bias)(x)
x = BatchNorm(name=bn_name_base + ‘2c‘)(x, training=train_bn)
x = KL.Add()([x, input_tensor])
x = KL.Activation(‘relu‘, name=‘res‘ + str(stage) + block + ‘_out‘)(x)
return x
def conv_block(input_tensor, kernel_size, filters, stage, block, strides=(2, 2), use_bias=True, train_bn=True):
"""conv_block is the block that has a conv layer at shortcut
# Arguments
input_tensor: input tensor
kernel_size: default 3, the kernel size of middle conv layer at main path
filters: list of integers, the nb_filters of 3 conv layer at main path
stage: integer, current stage label, used for generating layer names
block: ‘a‘,‘b‘..., current block label, used for generating layer names
use_bias: Boolean. To use or not use a bias in conv layers.
train_bn: Boolean. Train or freeze Batch Norm layers
Note that from stage 3, the first conv layer at main path is with subsample=(2,2)
And the shortcut should have subsample=(2,2) as well
"""
nb_filter1, nb_filter2, nb_filter3 = filters
conv_name_base = ‘res‘ + str(stage) + block + ‘_branch‘
bn_name_base = ‘bn‘ + str(stage) + block + ‘_branch‘
x = KL.Conv2D(nb_filter1, (1, 1), strides=strides, name=conv_name_base + ‘2a‘, use_bias=use_bias)(input_tensor)
x = BatchNorm(name=bn_name_base + ‘2a‘)(x, training=train_bn)
x = KL.Activation(‘relu‘)(x)
x = KL.Conv2D(nb_filter2, (kernel_size, kernel_size), padding=‘same‘,name=conv_name_base + ‘2b‘, use_bias=use_bias)(x)
#stride 默认为1
x = BatchNorm(name=bn_name_base + ‘2b‘)(x, training=train_bn)
x = KL.Activation(‘relu‘)(x)
x = KL.Conv2D(nb_filter3, (1, 1), name=conv_name_base + ‘2c‘, use_bias=use_bias)(x)
x = BatchNorm(name=bn_name_base + ‘2c‘)(x, training=train_bn)
shortcut = KL.Conv2D(nb_filter3, (1, 1), strides=strides, name=conv_name_base + ‘1‘, use_bias=use_bias)(input_tensor)
shortcut = BatchNorm(name=bn_name_base + ‘1‘)(shortcut, training=train_bn)
x = KL.Add()([x, shortcut]) # 将所有张量加起来,是每个对应元素的求和 x与shortcut张量维度及大小完全一致
x = KL.Activation(‘relu‘, name=‘res‘ + str(stage) + block + ‘_out‘)(x)
return x
def resnet_graph(input_image, architecture, stage5=False, train_bn=True):
"""Build a ResNet graph.
architecture: Can be resnet50 or resnet101
stage5: Boolean. If False, stage5 of the network is not created
train_bn: Boolean. Train or freeze Batch Norm layers
"""
assert architecture in ["resnet50", "resnet101"]
# Stage 1
x = KL.ZeroPadding2D((3, 3))(input_image) # w and h add three row and col format in default ways
x = KL.Conv2D(64, (7, 7), strides=(2, 2), name=‘conv1‘, use_bias=True)(x)
x = BatchNorm(name=‘bn_conv1‘)(x, training=train_bn) # BatchNorm 应该是继承了库函数的类
x = KL.Activation(‘relu‘)(x)
C1 = x = KL.MaxPooling2D((3, 3), strides=(2, 2), padding="same")(x)
# C1 and x have reduced the 4 times from original image after stage one
# Stage 2
x = conv_block(x, 3, [64, 64, 256], stage=2, block=‘a‘, strides=(1, 1), train_bn=train_bn) # the size of image above will not change ,because strides is (1,1) which have modified default value
x = identity_block(x, 3, [64, 64, 256], stage=2, block=‘b‘, train_bn=train_bn)
C2 = x = identity_block(x, 3, [64, 64, 256], stage=2, block=‘c‘, train_bn=train_bn)
# Stage 3
x = conv_block(x, 3, [128, 128, 512], stage=3, block=‘a‘, train_bn=train_bn)
x = identity_block(x, 3, [128, 128, 512], stage=3, block=‘b‘, train_bn=train_bn)
x = identity_block(x, 3, [128, 128, 512], stage=3, block=‘c‘, train_bn=train_bn)
C3 = x = identity_block(x, 3, [128, 128, 512], stage=3, block=‘d‘, train_bn=train_bn)
# Stage 4
x = conv_block(x, 3, [256, 256, 1024], stage=4, block=‘a‘, train_bn=train_bn)
block_count = {"resnet50": 5, "resnet101": 22}[architecture] # 挺巧妙的
for i in range(block_count):
x = identity_block(x, 3, [256, 256, 1024], stage=4, block=chr(98 + i), train_bn=train_bn)
C4 = x
# Stage 5
if stage5:
x = conv_block(x, 3, [512, 512, 2048], stage=5, block=‘a‘, train_bn=train_bn)
x = identity_block(x, 3, [512, 512, 2048], stage=5, block=‘b‘, train_bn=train_bn)
C5 = identity_block(x, 3, [512, 512, 2048], stage=5, block=‘c‘, train_bn=train_bn)
else:
C5 = None
return [C1, C2, C3, C4, C5]
############################################################
# Proposal Layer
############################################################
def apply_box_deltas_graph(boxes, deltas):
"""Applies the given deltas to the given boxes.
boxes: [N, (y1, x1, y2, x2)] boxes to update
deltas: [N, (dy, dx, log(dh), log(dw))] refinements to apply
"""
# Convert to y, x, h, w
height = boxes[:, 2] - boxes[:, 0]
width = boxes[:, 3] - boxes[:, 1]
center_y = boxes[:, 0] + 0.5 * height
center_x = boxes[:, 1] + 0.5 * width
# Apply deltas
center_y += deltas[:, 0] * height
center_x += deltas[:, 1] * width
height *= tf.exp(deltas[:, 2])
width *= tf.exp(deltas[:, 3])
# Convert back to y1, x1, y2, x2
y1 = center_y - 0.5 * height
x1 = center_x - 0.5 * width
y2 = y1 + height
x2 = x1 + width
result = tf.stack([y1, x1, y2, x2], axis=1, name="apply_box_deltas_out")
return result
def clip_boxes_graph(boxes, window):
"""
boxes: [N, (y1, x1, y2, x2)]
window: [4] in the form y1, x1, y2, x2
"""
# Split
wy1, wx1, wy2, wx2 = tf.split(window, 4)
y1, x1, y2, x2 = tf.split(boxes, 4, axis=1)
# Clip
y1 = tf.maximum(tf.minimum(y1, wy2), wy1)
x1 = tf.maximum(tf.minimum(x1, wx2), wx1)
y2 = tf.maximum(tf.minimum(y2, wy2), wy1)
x2 = tf.maximum(tf.minimum(x2, wx2), wx1)
clipped = tf.concat([y1, x1, y2, x2], axis=1, name="clipped_boxes")
clipped.set_shape((clipped.shape[0], 4))
return clipped
class ProposalLayer(KE.Layer):
"""Receives anchor scores and selects a subset to pass as proposals
to the second stage. Filtering is done based on anchor scores and
non-max suppression to remove overlaps. It also applies bounding
box refinement deltas to anchors.
Inputs:
rpn_probs: [batch, num_anchors, (bg prob, fg prob)]
rpn_bbox: [batch, num_anchors, (dy, dx, log(dh), log(dw))]
anchors: [batch, num_anchors, (y1, x1, y2, x2)] anchors in normalized coordinates
Returns:
Proposals in normalized coordinates [batch, rois, (y1, x1, y2, x2)]
"""
def __init__(self, proposal_count, nms_threshold, config=None, **kwargs):
super(ProposalLayer, self).__init__(**kwargs) # adopt super function to call parent class original function
self.config = config
self.proposal_count = proposal_count
self.nms_threshold = nms_threshold
def call(self, inputs):
# Box Scores. Use the foreground class confidence. [Batch, num_rois, 1]
scores = inputs[0][:, :, 1] # 变成了[Batch, num_rois] 取的前景
# Box deltas [batch, num_rois, 4]
deltas = inputs[1]
deltas = deltas * np.reshape(self.config.RPN_BBOX_STD_DEV, [1, 1, 4]) # RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
# 上一行代码相当于增加了一个维度
# Anchors
anchors = inputs[2]
# Improve performance by trimming to top anchors by score
# and doing the rest on the smaller subset.
pre_nms_limit = tf.minimum(self.config.PRE_NMS_LIMIT, tf.shape(anchors)[1]) # 返回最小的值 # self.config.PRE_NMS_LIMIT=6000 tf.shape(anchors)[1] = num_anchors
ix = tf.nn.top_k(scores, pre_nms_limit, sorted=True, name="top_anchors").indices # indices 只是那一行的值,将会从大到小排序 # 返回每行最大k个值与对应的索引 value and indices
# scores 是n行2列
scores = batch_slice([scores, ix], lambda x, y: tf.gather(x, y), self.config.batch_size) # IMAGES_PER_GPU=2 [?,?]
deltas = batch_slice([deltas, ix], lambda x, y: tf.gather(x, y), self.config.batch_size) # [?,?,?]
pre_nms_anchors = batch_slice([anchors, ix], lambda a, x: tf.gather(a, x), self.config.batch_size, names=["pre_nms_anchors"]) #[?,?,?]
# Apply deltas to anchors to get refined anchors.
# [batch, N, (y1, x1, y2, x2)]
boxes = batch_slice([pre_nms_anchors, deltas], lambda x, y: apply_box_deltas_graph(x, y), self.config.batch_size, names=["refined_anchors"])
# boxes 是坐标 预测出来的delta是中心点与高和宽
# Clip to image boundaries. Since we‘re in normalized coordinates,
# clip to 0..1 range. [batch, N, (y1, x1, y2, x2)]
window = np.array([0, 0, 1, 1], dtype=np.float32)
boxes = batch_slice(boxes,
lambda x: clip_boxes_graph(x, window),
self.config.batch_size,
names=["refined_anchors_clipped"])
# Filter out small boxes
# According to Xinlei Chen‘s paper, this reduces detection accuracy
# for small objects, so we‘re skipping it.
# Non-max suppression
def nms(boxes, scores):
indices = tf.image.non_max_suppression(
boxes, scores, self.proposal_count,
self.nms_threshold, name="rpn_non_max_suppression")
proposals = tf.gather(boxes, indices)
# Pad if needed
padding = tf.maximum(self.proposal_count - tf.shape(proposals)[0], 0)
proposals = tf.pad(proposals, [(0, padding), (0, 0)])
return proposals
proposals = batch_slice([boxes, scores], nms, self.config.batch_size)
return proposals
def compute_output_shape(self, input_shape):
return (None, self.proposal_count, 4)
############################################################
# ROIAlign Layer
############################################################
# def log2_graph(x):
# """Implementation of Log2. TF doesn‘t have a native implementation."""
# return tf.log(x) / tf.log(2.0)
class PyramidROIAlign(KE.Layer):
"""Implements ROI Pooling on multiple levels of the feature pyramid.
Params:
- pool_shape: [pool_height, pool_width] of the output pooled regions. Usually [7, 7]
Inputs:
- boxes: [batch, num_boxes, (y1, x1, y2, x2)] in normalized
coordinates. Possibly padded with zeros if not enough
boxes to fill the array.
- image_meta: [batch, (meta data)] Image details. See compose_image_meta()
- feature_maps: List of feature maps from different levels of the pyramid.
Each is [batch, height, width, channels]
Output:
Pooled regions in the shape: [batch, num_boxes, pool_height, pool_width, channels].
The width and height are those specific in the pool_shape in the layer
constructor.
"""
def __init__(self, pool_shape, **kwargs):
super(PyramidROIAlign, self).__init__(**kwargs)
self.pool_shape = tuple(pool_shape)
def call(self, inputs):
# Crop boxes [batch, num_boxes, (y1, x1, y2, x2)] in normalized coords
boxes = inputs[0]
# Image meta
# Holds details about the image. See compose_image_meta()
image_meta = inputs[1]
# Feature Maps. List of feature maps from different level of the
# feature pyramid. Each is [batch, height, width, channels]
feature_maps = inputs[2:]
# Assign each ROI to a level in the pyramid based on the ROI area.
y1, x1, y2, x2 = tf.split(boxes, 4, axis=2) # [p2,p3,p4,p5]
h = y2 - y1
w = x2 - x1
# Use shape of first image. Images in a batch must have the same size.
image_shape = parse_image_meta_graph(image_meta)[‘image_shape‘][0] # 使用 "image_shape": image_shape
# return {"image_id": image_id,"original_image_shape": original_image_shape,
# "image_shape": image_shape,"window": window,"scale": scale, "active_class_ids": active_class_ids, }
# Equation 1 in the Feature Pyramid Networks paper. Account for
# the fact that our coordinates are normalized here.
# e.g. a 224x224 ROI (in pixels) maps to P4
image_area = tf.cast(image_shape[0] * image_shape[1], tf.float32)
roi_level=tf.log(tf.sqrt(h * w) / (224.0 / tf.sqrt(image_area))) / tf.log(2.0)
roi_level = tf.minimum(5, tf.maximum(2, 4 + tf.cast(tf.round(roi_level), tf.int32)))
roi_level = tf.squeeze(roi_level, 2)
# Loop through levels and apply ROI pooling to each. P2 to P5.
pooled = []
box_to_level = [] # 保存每一层的索引号
for i, level in enumerate(range(2, 6)):
ix = tf.where(tf.equal(roi_level, level)) # ix为每一层的索引号,与boxes对应
level_boxes = tf.gather_nd(boxes, ix) # 提取单层的boxes
# Box indices for crop_and_resize.
box_indices = tf.cast(ix[:, 0], tf.int32) # 变成如此格式[ 6 7 8 9 17 18]
# Keep track of which box is mapped to which level
box_to_level.append(ix)
# Stop gradient propogation to ROI proposals
level_boxes = tf.stop_gradient(level_boxes)
box_indices = tf.stop_gradient(box_indices)
# Crop and Resize
# From Mask R-CNN paper: "We sample four regular locations, so
# that we can evaluate either max or average pooling. In fact,
# interpolating only a single value at each bin center (without
# pooling) is nearly as effective."
#
# Here we use the simplified approach of a single value per bin,
# which is how it‘s done in tf.crop_and_resize()
# Result: [batch * num_boxes, pool_height, pool_width, channels]
pooled.append(tf.image.crop_and_resize(feature_maps[i], level_boxes, box_indices, self.pool_shape, method="bilinear")) # 双线性插值
# 特征图(batch) 对应batch选的box 挑选对应batch特征
#上一步代码将对应batch的对应特征层的特征图挑选出来,然后用该层对应的特征box去框住,然后pooling出pool_shape的尺寸
# Pack pooled features into one tensor
pooled = tf.concat(pooled, axis=0) # 按行拼接,则列的维度不变,行的维度增加
# Pack box_to_level mapping into one array and add another
# column representing the order of pooled boxes
box_to_level = tf.concat(box_to_level, axis=0) # 按行拼接,则列的维度不变,行的维度增加 保存box对应的层
box_range = tf.expand_dims(tf.range(tf.shape(box_to_level)[0]), 1) # [len(box_to_level[0]),1] 大概形式为:[[2],[3],[1],[0]....]
box_to_level = tf.concat([tf.cast(box_to_level, tf.int32), box_range], axis=1)
‘‘‘
box_to_level=[1,12,0]
[2,24,1]
[0,31,2]
...
‘‘‘
# Rearrange pooled features to match the order of the original boxes
# Sort box_to_level by batch then box index
# TF doesn‘t have a way to sort by two columns, so merge them and sort.
sorting_tensor = box_to_level[:, 0] * 100000 + box_to_level[:, 1]
ix = tf.nn.top_k(sorting_tensor, k=tf.shape(box_to_level)[0]).indices[::-1] # [ 8 9 10 11] 用来存储索引,
‘‘‘
[[ 0.98925872 0.15743092 0.76471106 0.5949957 ]
[ 0.95766488 0.67846336 0.21058844 0.2644312 ]
[ 0.65531991 0.61445187 0.65372938 0.88111084]]
TopKV2(values=array([[ 0.98925872, 0.76471106],
[ 0.95766488, 0.67846336],
[ 0.88111084, 0.65531991]]), indices=array([[0, 2],
[0, 1],
[3, 0]]))
‘‘‘
ix = tf.gather(box_to_level[:, 2], ix) # boxes所对应的索引号
pooled = tf.gather(pooled, ix) #
# Re-add the batch dimension
shape = tf.concat([tf.shape(boxes)[:2], tf.shape(pooled)[1:]], axis=0) # [ 4 number 7 7 256]
pooled = tf.reshape(pooled, shape)
return pooled
def compute_output_shape(self, input_shape):
return input_shape[0][:2] + self.pool_shape + (input_shape[2][-1], )
############################################################
# Detection Target Layer
############################################################
def overlaps_graph(boxes1, boxes2):
"""Computes IoU overlaps between two sets of boxes.
boxes1, boxes2: [N, (y1, x1, y2, x2)].
"""
# 1. Tile boxes2 and repeat boxes1. This allows us to compare
# every boxes1 against every boxes2 without loops.
# TF doesn‘t have an equivalent to np.repeat() so simulate it
# using tf.tile() and tf.reshape.
b1 = tf.reshape(tf.tile(tf.expand_dims(boxes1, 1), [1, 1, tf.shape(boxes2)[0]]), [-1, 4])
b2 = tf.tile(boxes2, [tf.shape(boxes1)[0], 1])
# 2. Compute intersections
b1_y1, b1_x1, b1_y2, b1_x2 = tf.split(b1, 4, axis=1)
b2_y1, b2_x1, b2_y2, b2_x2 = tf.split(b2, 4, axis=1)
y1 = tf.maximum(b1_y1, b2_y1)
x1 = tf.maximum(b1_x1, b2_x1)
y2 = tf.minimum(b1_y2, b2_y2)
x2 = tf.minimum(b1_x2, b2_x2)
intersection = tf.maximum(x2 - x1, 0) * tf.maximum(y2 - y1, 0)
# 3. Compute unions
b1_area = (b1_y2 - b1_y1) * (b1_x2 - b1_x1)
b2_area = (b2_y2 - b2_y1) * (b2_x2 - b2_x1)
union = b1_area + b2_area - intersection
# 4. Compute IoU and reshape to [boxes1, boxes2]
iou = intersection / union
overlaps = tf.reshape(iou, [tf.shape(boxes1)[0], tf.shape(boxes2)[0]])
return overlaps
def box_refinement_graph(box, gt_box): # all of dimension must same
"""Compute refinement needed to transform box to gt_box.
box and gt_box are [N, (y1, x1, y2, x2)]
"""
box = tf.cast(box, tf.float32)
gt_box = tf.cast(gt_box, tf.float32)
height = box[:, 2] - box[:, 0]
width = box[:, 3] - box[:, 1]
center_y = box[:, 0] + 0.5 * height
center_x = box[:, 1] + 0.5 * width
gt_height = gt_box[:, 2] - gt_box[:, 0]
gt_width = gt_box[:, 3] - gt_box[:, 1]
gt_center_y = gt_box[:, 0] + 0.5 * gt_height
gt_center_x = gt_box[:, 1] + 0.5 * gt_width
dy = (gt_center_y - center_y) / height
dx = (gt_center_x - center_x) / width
dh = tf.log(gt_height / height)
dw = tf.log(gt_width / width)
result = tf.stack([dy, dx, dh, dw], axis=1) # 真实box经过变换后得到的delta
return result
def detection_targets_graph(proposals, gt_class_ids, gt_boxes, gt_masks, config):
‘‘‘
DetectionTargetLayer的输入包含了,target_rois, input_gt_class_ids, gt_boxes, input_gt_masks。
其中target_rois是ProposalLayer输出的结果。首先,计算target_rois中的每一个rois和哪一个真实的框gt_boxes iou值,
如果最大的iou大于0.5,则被认为是正样本,负样本是是iou小于0.5并且和crowd box相交不大的anchor,选择出了正负样本,
还要保证样本的均衡性,具体可以才配置文件中进行配置。最后计算了正样本中的anchor和哪一个真实的框最接近,
用真实的框和anchor计算出偏移值,并且将mask的大小resize成28*28的
:param proposals:
:param gt_class_ids:
:param gt_boxes:
:param gt_masks:
:param config:
:return:
‘‘‘
"""Generates detection targets for one image. Subsamples proposals and
generates target class IDs, bounding box deltas, and masks for each.
Inputs:
proposals: [POST_NMS_ROIS_TRAINING, (y1, x1, y2, x2)] in normalized coordinates. Might
be zero padded if there are not enough proposals.
gt_class_ids: [MAX_GT_INSTANCES] int class IDs
gt_boxes: [MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
gt_masks: [height, width, MAX_GT_INSTANCES] of boolean type.
Returns: Target ROIs and corresponding class IDs, bounding box shifts,
and masks.
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs. Zero padded.
deltas: [TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw))]
masks: [TRAIN_ROIS_PER_IMAGE, height, width]. Masks cropped to bbox
boundaries and resized to neural network output size.
Note: Returned arrays might be zero padded if not enough target ROIs.
"""
# Assertions
asserts = [
tf.Assert(tf.greater(tf.shape(proposals)[0], 0), [proposals],
name="roi_assertion"),
]
with tf.control_dependencies(asserts):
proposals = tf.identity(proposals)
# Remove zero padding
proposals, _ = trim_zeros_graph(proposals, name="trim_proposals")
gt_boxes, non_zeros = trim_zeros_graph(gt_boxes, name="trim_gt_boxes")
gt_class_ids = tf.boolean_mask(gt_class_ids, non_zeros, name="trim_gt_class_ids")
gt_masks = tf.gather(gt_masks, tf.where(non_zeros)[:, 0], axis=2, name="trim_gt_masks")
# Handle COCO crowds
# 在coco数据集中,有的框会标注很多的物体,在训练中,去掉这些框
# A crowd box in COCO is a bounding box around several instances. Exclude
# them from training. A crowd box is given a negative class ID.
crowd_ix = tf.where(gt_class_ids < 0)[:, 0] #
non_crowd_ix = tf.where(gt_class_ids > 0)[:, 0]
crowd_boxes = tf.gather(gt_boxes, crowd_ix)
# 下面就是一张图片中真实存在的物体用于训练,已经排除了crowd box
gt_class_ids = tf.gather(gt_class_ids, non_crowd_ix)
gt_boxes = tf.gather(gt_boxes, non_crowd_ix)
gt_masks = tf.gather(gt_masks, non_crowd_ix, axis=2)
# Compute overlaps matrix [proposals, gt_boxes]
overlaps = overlaps_graph(proposals, gt_boxes)
# Compute overlaps with crowd boxes [proposals, crowd_boxes]
crowd_overlaps = overlaps_graph(proposals, crowd_boxes)
crowd_iou_max = tf.reduce_max(crowd_overlaps, axis=1)
no_crowd_bool = (crowd_iou_max < 0.001) # 计算拥挤的crowd box
# Determine positive and negative ROIs
roi_iou_max = tf.reduce_max(overlaps, axis=1)
‘‘‘
上一行代码的解释:
roi_iou_max= [[0.76174609 0.80333894 0.68258544 0.57697359 0.85310562]
[0.43019702 0.52369922 0.97526372 0.73503863 0.57165666]
[0.35172219 0.23619196 0.50828622 0.60014882 0.67331094]
[0.15814392 0.68016351 0.08231241 0.47771463 0.69517046]]
返回值:[0.85310562 0.97526372 0.67331094 0.69517046]
‘‘‘
# 1. Positive ROIs are those with >= 0.5 IoU with a GT box
positive_roi_bool = (roi_iou_max >= 0.5) # eg:[ True False False False False True],实际该顺序代表proposal的顺序
positive_indices = tf.where(positive_roi_bool)[:, 0] # eg:[0 5] # 按overlaps的行挑选 # 得到 positive_roi_bppl满足的位置
# 2. Negative ROIs are those with < 0.5 with every GT box. Skip crowds.
negative_indices = tf.where(tf.logical_and(roi_iou_max < 0.5, no_crowd_bool))[:, 0] ################## 维度不一样无法运行
# Subsample ROIs. Aim for 33% positive
# Positive ROIs
positive_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO) # TRAIN_ROIS_PER_IMAGE = 200 ROI_POSITIVE_RATIO = 0.33 (32,0.33)
positive_indices = tf.random_shuffle(positive_indices)[:positive_count] #[:positive_count]中的positive_count超过positive_indices数量,则全部shuffle
# 从proposals个中选择IOU值>=0.5后的序列,随机排序后,挑选前positive_count
positive_count = tf.shape(positive_indices)[0]
# Negative ROIs. Add enough to maintain positive:negative ratio.
r = 1.0 / config.ROI_POSITIVE_RATIO # 1/0.33
negative_count = tf.cast(r * tf.cast(positive_count, tf.float32), tf.int32) - positive_count # positive_count*3-positive_count
negative_indices = tf.random_shuffle(negative_indices)[:negative_count] # 从proposals个中选择IOU值<0.5后的序列,随机排序后,挑选前negative_count
# Gather selected ROIs
positive_rois = tf.gather(proposals, positive_indices) # 按positive_indices选择出len(positive_indices)个proposals的正样本
negative_rois = tf.gather(proposals, negative_indices)# 按negative_indices选择出len(negative_indices)个proposals
# Assign positive ROIs to GT boxes.
positive_overlaps = tf.gather(overlaps, positive_indices) # 通过变换得到positive_indices索引(按行挑选的),实际上面代码只是做了正样本与负样本中如何挑选在len(proposal)中随机len(positive_indices)的正样本
# 以下函数实际为选择后的positive_overlaps正样本取寻找对应的标签
roi_gt_box_assignment = tf.cond( # tf.cond(条件,a,b)条件满足输出a,否则b
tf.greater(tf.shape(positive_overlaps)[1], 0), # tf.greater(a,b) 必须满足a>b输出为True
true_fn = lambda: tf.argmax(positive_overlaps, axis=1), # 输出该行列中最大的值
false_fn = lambda: tf.cast(tf.constant([]),tf.int64)
)
roi_gt_boxes = tf.gather(gt_boxes, roi_gt_box_assignment) # 寻找到对应标签后,开始提取对应的真实box,即从提取网络后进行处理选择的正样本positive_overlaps所对应的box
roi_gt_class_ids = tf.gather(gt_class_ids, roi_gt_box_assignment) # 寻找到对应标签后,开始提取对应的真实class_ids,即从提取网络后进行处理选择的正样本positive_overlaps所对应的class_ids
# Compute bbox refinement for positive ROIs
deltas = box_refinement_graph(positive_rois, roi_gt_boxes) # positive_rois, roi_gt_boxes 一一对应的 #[dy,dx,dh,dw]
deltas /= config.BBOX_STD_DEV # BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
# Assign positive ROIs to GT masks
# Permute masks to [N, height, width, 1]
transposed_masks = tf.expand_dims(tf.transpose(gt_masks, [2, 0, 1]), -1) # 调换维度位置, 并增加最后一个维度
# Pick the right mask for each ROI
# 根据roi_gt_box_assignment采集正样本mask
roi_masks = tf.gather(transposed_masks, roi_gt_box_assignment) # 寻找到对应标签后,开始提取对应的真实mask,即从提取网络后进行处理选择的正样本positive_overlaps所对应的mask
# Compute mask targets
boxes = positive_rois # boxes置为positive_rois,即正样本推荐框
# if config.USE_MINI_MASK:
# ‘‘‘
# 如果采用mini_mask,则需要在这里将positive_rois转换到roi_gt_boxes的范围内,
# 因为mini_mask仅仅记录了gt_boxes内部的mask信息
# 正如作者解释注释的"We store mask pixels that are inside the object bounding box,
# ‘‘‘
# # Transform ROI coordinates from normalized image space
# # to normalized mini-mask space.
# y1, x1, y2, x2 = tf.split(positive_rois, 4, axis=1)
# gt_y1, gt_x1, gt_y2, gt_x2 = tf.split(roi_gt_boxes, 4, axis=1)
# gt_h = gt_y2 - gt_y1
# gt_w = gt_x2 - gt_x1
# y1 = (y1 - gt_y1) / gt_h
# x1 = (x1 - gt_x1) / gt_w
# y2 = (y2 - gt_y1) / gt_h
# x2 = (x2 - gt_x1) / gt_w
# boxes = tf.concat([y1, x1, y2, x2], 1)
#
box_ids = tf.range(0, tf.shape(roi_masks)[0])
masks = tf.image.crop_and_resize(tf.cast(roi_masks, tf.float32), boxes, box_ids, config.MASK_SHAPE) # MASK_SHAPE = [28, 28] # roi_masks必须是A 4-D tensor of shape [batch, image_height, image_width, depth]
# roi_masks与boxes一一对应,其中boxes来源挑选的网络输出框。
# 经过尝试 box_ids 将roi_masks[0]被boxes[0]截取,并resize成mask_shape尺寸(28,28),以此类推,最终输出roi_masks.shape[0]个图像,即为mask图
‘‘‘
这个函数操作相当于RoiPooling操作,函数原型是:
def crop_and_resize(image, boxes, box_ind, crop_size, method=None, extrapolation_value=None, name=None):
参数解释:
image:表示特征图,最终得到的每个proposal的特征图从这个特征图上得到
boxes:表示每个proposal的坐标(N,4)一般是
box_ind:表示proposal是来自mini_batch中的哪一张图片
crop_size:表示Roi_pooling之后的大小
‘‘‘
# Remove the extra dimension from masks.
‘‘‘
# ‘t‘ is a tensor of shape [1, 2, 1, 3, 1, 1]
shape(squeeze(t)) ==> [2, 3]
Or, to remove specific size 1 dimensions:
# ‘t‘ is a tensor of shape [1, 2, 1, 3, 1, 1]
shape(squeeze(t, [2, 4])) ==> [1, 2, 3, 1]
‘‘‘
masks = tf.squeeze(masks, axis=3) # [roi_gt_box_assignment,height,width,1] axis=3的维度必须是1,否则会报错
# Threshold mask pixels at 0.5 to have GT masks be 0 or 1 to use with binary cross entropy loss.
masks = tf.round(masks) # 四舍五入 0.5是舍弃的
# Append negative ROIs and pad bbox deltas and masks that are not used for negative ROIs with zeros.
rois = tf.concat([positive_rois, negative_rois], axis=0)
N = tf.shape(negative_rois)[0]
P = tf.maximum(config.TRAIN_ROIS_PER_IMAGE - tf.shape(rois)[0], 0) # TRAIN_ROIS_PER_IMAGE = 32
rois = tf.pad(rois, [(0, P), (0, 0)])
roi_gt_boxes = tf.pad(roi_gt_boxes, [(0, N + P), (0, 0)])
roi_gt_class_ids = tf.pad(roi_gt_class_ids, [(0, N + P)])
deltas = tf.pad(deltas, [(0, N + P), (0, 0)])
masks = tf.pad(masks, [[0, N + P], (0, 0), (0, 0)])
‘‘‘
通过rpn网络得到的anchor,选择出来正负样本,并计算出正样本和真实框的差距,以及要预测的mask的值,
这些都是在后面的网络中计算损失函数需要的真实值
‘‘‘
# 返回的变量按第一维度的上面是正样本对应的,下面是负样本,且对应的负样本其它变量用0填充
return rois, roi_gt_class_ids, deltas, masks
class DetectionTargetLayer(KE.Layer):
"""Subsamples proposals and generates target box refinement, class_ids,
and masks for each.
Inputs:
proposals: [batch, N, (y1, x1, y2, x2)] in normalized coordinates. Might be zero padded if there are not enough proposals.
gt_class_ids: [batch, MAX_GT_INSTANCES] Integer class IDs.
gt_boxes: [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in normalized coordinates.
gt_masks: [batch, height, width, MAX_GT_INSTANCES] of boolean type
Returns: Target ROIs and corresponding class IDs, bounding box shifts, and masks.
rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw)]
target_mask: [batch, TRAIN_ROIS_PER_IMAGE, height, width]
Masks cropped to bbox boundaries and resized to neural
network output size.
Note: Returned arrays might be zero padded if not enough target ROIs.
"""
def __init__(self, config, **kwargs):
super(DetectionTargetLayer, self).__init__(**kwargs)
self.config = config
def call(self, inputs):
proposals = inputs[0]
gt_class_ids = inputs[1]
gt_boxes = inputs[2]
gt_masks = inputs[3]
# Slice the batch and run a graph for each slice
# TODO: Rename target_bbox to target_deltas for clarity
names = ["rois", "target_class_ids", "target_bbox", "target_mask"]
outputs = batch_slice([proposals, gt_class_ids, gt_boxes, gt_masks],
lambda w, x, y, z: detection_targets_graph(w, x, y, z, self.config), self.config.batch_size, names=names)
return outputs
def compute_output_shape(self, input_shape):
return [
(None, self.config.TRAIN_ROIS_PER_IMAGE, 4), # rois
(None, self.config.TRAIN_ROIS_PER_IMAGE), # class_ids
(None, self.config.TRAIN_ROIS_PER_IMAGE, 4), # deltas
(None, self.config.TRAIN_ROIS_PER_IMAGE, self.config.MASK_SHAPE[0], self.config.MASK_SHAPE[1]) # masks
]
def compute_mask(self, inputs, mask=None):
return [None, None, None, None]
############################################################
# Detection Layer
############################################################
def refine_detections_graph(rois, probs, deltas, window, config):
"""Refine classified proposals and filter overlaps and return final
detections.
Inputs:
rois: [N, (y1, x1, y2, x2)] in normalized coordinates
probs: [N, num_classes]. Class probabilities.
deltas: [N, num_classes, (dy, dx, log(dh), log(dw))]. Class-specific
bounding box deltas.
window: (y1, x1, y2, x2) in normalized coordinates. The part of the image
that contains the image excluding the padding.
Returns detections shaped: [num_detections, (y1, x1, y2, x2, class_id, score)] where
coordinates are normalized.
"""
# Class IDs per ROI
class_ids = tf.argmax(probs, axis=1, output_type=tf.int32)
# Class probability of the top class of each ROI
indices = tf.stack([tf.range(probs.shape[0]), class_ids], axis=1)
class_scores = tf.gather_nd(probs, indices)
# Class-specific bounding box deltas
deltas_specific = tf.gather_nd(deltas, indices)
# Apply bounding box deltas
# Shape: [boxes, (y1, x1, y2, x2)] in normalized coordinates
refined_rois = apply_box_deltas_graph(
rois, deltas_specific * config.BBOX_STD_DEV)
# Clip boxes to image window
refined_rois = clip_boxes_graph(refined_rois, window)
# TODO: Filter out boxes with zero area
# Filter out background boxes
keep = tf.where(class_ids > 0)[:, 0]
# Filter out low confidence boxes
if config.DETECTION_MIN_CONFIDENCE:
conf_keep = tf.where(class_scores >= config.DETECTION_MIN_CONFIDENCE)[:, 0]
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
tf.expand_dims(conf_keep, 0))
keep = tf.sparse_tensor_to_dense(keep)[0]
# Apply per-class NMS
# 1. Prepare variables
pre_nms_class_ids = tf.gather(class_ids, keep)
pre_nms_scores = tf.gather(class_scores, keep)
pre_nms_rois = tf.gather(refined_rois, keep)
unique_pre_nms_class_ids = tf.unique(pre_nms_class_ids)[0]
def nms_keep_map(class_id):
"""Apply Non-Maximum Suppression on ROIs of the given class."""
# Indices of ROIs of the given class
ixs = tf.where(tf.equal(pre_nms_class_ids, class_id))[:, 0]
# Apply NMS
class_keep = tf.image.non_max_suppression(
tf.gather(pre_nms_rois, ixs),
tf.gather(pre_nms_scores, ixs),
max_output_size=config.DETECTION_MAX_INSTANCES,
iou_threshold=config.DETECTION_NMS_THRESHOLD)
# Map indices
class_keep = tf.gather(keep, tf.gather(ixs, class_keep))
# Pad with -1 so returned tensors have the same shape
gap = config.DETECTION_MAX_INSTANCES - tf.shape(class_keep)[0]
class_keep = tf.pad(class_keep, [(0, gap)],
mode=‘CONSTANT‘, constant_values=-1) # 补齐位置时候用-1填充
# Set shape so map_fn() can infer result shape
class_keep.set_shape([config.DETECTION_MAX_INSTANCES])
return class_keep
# class_keep 得到的索引号就是原来class_ids中所拥有索引,指定索引是几就对应class_ids中的位置
# 2. Map over class IDs
nms_keep = tf.map_fn(nms_keep_map, unique_pre_nms_class_ids,
dtype=tf.int64)
# 3. Merge results into one list, and remove -1 padding
nms_keep = tf.reshape(nms_keep, [-1])
nms_keep = tf.gather(nms_keep, tf.where(nms_keep > -1)[:, 0])
# 4. Compute intersection between keep and nms_keep
keep = tf.sets.set_intersection(tf.expand_dims(keep, 0),
tf.expand_dims(nms_keep, 0))
keep = tf.sparse_tensor_to_dense(keep)[0]
# Keep top detections
roi_count = config.DETECTION_MAX_INSTANCES
class_scores_keep = tf.gather(class_scores, keep)
num_keep = tf.minimum(tf.shape(class_scores_keep)[0], roi_count)
top_ids = tf.nn.top_k(class_scores_keep, k=num_keep, sorted=True)[1]
keep = tf.gather(keep, top_ids)
# Arrange output as [N, (y1, x1, y2, x2, class_id, score)]
# Coordinates are normalized.
detections = tf.concat([
tf.gather(refined_rois, keep),
tf.to_float(tf.gather(class_ids, keep))[..., tf.newaxis],
tf.gather(class_scores, keep)[..., tf.newaxis]
], axis=1)
# Pad with zeros if detections < DETECTION_MAX_INSTANCES
gap = config.DETECTION_MAX_INSTANCES - tf.shape(detections)[0]
detections = tf.pad(detections, [(0, gap), (0, 0)], "CONSTANT")
return detections
class DetectionLayer(KE.Layer):
"""Takes classified proposal boxes and their bounding box deltas and
returns the final detection boxes.
Returns:
[batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] where
coordinates are normalized.
"""
def __init__(self, config=None, **kwargs):
super(DetectionLayer, self).__init__(**kwargs)
self.config = config
def call(self, inputs):
rois = inputs[0]
mrcnn_class = inputs[1]
mrcnn_bbox = inputs[2]
image_meta = inputs[3]
# Get windows of images in normalized coordinates. Windows are the area
# in the image that excludes the padding.
# Use the shape of the first image in the batch to normalize the window
# because we know that all images get resized to the same size.
m = parse_image_meta_graph(image_meta)
image_shape = m[‘image_shape‘][0]
window = norm_boxes_graph(m[‘window‘], image_shape[:2])
# Run detection refinement graph on each item in the batch
detections_batch = batch_slice(
[rois, mrcnn_class, mrcnn_bbox, window],
lambda x, y, w, z: refine_detections_graph(x, y, w, z, self.config),
self.config.batch_size)
# Reshape output
# [batch, num_detections, (y1, x1, y2, x2, class_id, class_score)] in
# normalized coordinates
return tf.reshape(
detections_batch,
[self.config.batch_size, self.config.DETECTION_MAX_INSTANCES, 6])
def compute_output_shape(self, input_shape):
return (None, self.config.DETECTION_MAX_INSTANCES, 6)
############################################################
# Region Proposal Network (RPN)
############################################################
def build_rpn_model(anchor_stride, anchors_per_location, depth):
"""Builds a Keras model of the Region Proposal Network.
It wraps the RPN graph so it can be used multiple times with shared
weights.
anchors_per_location: number of anchors per pixel in the feature map
anchor_stride: Controls the density of anchors. Typically 1 (anchors for
every pixel in the feature map), or 2 (every other pixel).
depth: Depth of the backbone feature map.
Returns a Keras Model object. The model outputs, when called, are:
rpn_class_logits: [batch, H * W * anchors_per_location, 2] Anchor classifier logits (before softmax)
rpn_probs: [batch, H * W * anchors_per_location, 2] Anchor classifier probabilities.
rpn_bbox: [batch, H * W * anchors_per_location, (dy, dx, log(dh), log(dw))] Deltas to be
applied to anchors.
"""
input_feature_map = KL.Input(shape=[None, None, depth], name="input_rpn_feature_map")
# TODO: check if stride of 2 causes alignment issues if the feature map
# is not even.
# Shared convolutional base of the RPN
shared = KL.Conv2D(512, (3, 3), padding=‘same‘, activation=‘relu‘, strides=anchor_stride,
name=‘rpn_conv_shared‘)(input_feature_map)
# Anchor Score. [batch, height, width, anchors per location * 2].
# 下面一句代码的含义,输出图片大小没有改变,因为卷积核为(1,1),输出图片为6张,而每一张图片都有height,width
# 所以最终输出为[batch, H * W * anchors_per_location, 2]的数量,没毛病
x = KL.Conv2D(2 * anchors_per_location, (1, 1), padding=‘valid‘, activation=‘linear‘, name=‘rpn_class_raw‘)(shared)
# Reshape to [batch, anchors, 2]=[batch,3,2]
rpn_class_logits = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 2]))(x)
# Softmax on last dimension of BG/FG.
rpn_probs = KL.Activation("softmax", name="rpn_class_xxx")(rpn_class_logits)
# Bounding box refinement. [batch, H, W, anchors per location * depth]
# where depth is [x, y, log(w), log(h)]
x = KL.Conv2D(anchors_per_location * 4, (1, 1), padding="valid", activation=‘linear‘, name=‘rpn_bbox_pred‘)(shared)
# Reshape to [batch, anchors, 4]
rpn_bbox = KL.Lambda(lambda t: tf.reshape(t, [tf.shape(t)[0], -1, 4]))(x)
outputs = [rpn_class_logits, rpn_probs, rpn_bbox]
return KM.Model([input_feature_map], outputs, name="rpn_model")
############################################################
# Feature Pyramid Network Heads
############################################################
def fpn_classifier_graph(rois, feature_maps, image_meta, pool_size, num_classes, train_bn=True, fc_layers_size=1024):
"""Builds the computation graph of the feature pyramid network classifier and regressor heads.
rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized coordinates.
feature_maps: List of feature maps from different layers of the pyramid,
[P2, P3, P4, P5]. Each has a different resolution. # 256 for every layer
image_meta: [batch, (meta data)] Image details. See compose_image_meta() [batch,1+3+3+4+1+num_class]
pool_size: The width of the square feature map generated from ROI Pooling.
meta = np.array(
[image_id] + # size=1
list(original_image_shape) + # size=3
list(image_shape) + # size=3
list(window) + # size=4 (y1, x1, y2, x2) in image cooredinates
[scale] + # size=1
list(active_class_ids) # size=num_classes
)
num_classes: number of classes, which determines the depth of the results
train_bn: Boolean. Train or freeze Batch Norm layers
fc_layers_size: Size of the 2 FC layers
Returns:
logits: [batch, num_rois, NUM_CLASSES] classifier logits (before softmax)
probs: [batch, num_rois, NUM_CLASSES] classifier probabilities
bbox_deltas: [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))] Deltas to apply to proposal boxes
"""
# ROI Pooling
# Shape: [batch, num_rois, POOL_SIZE, POOL_SIZE, channels]
x = PyramidROIAlign([pool_size, pool_size], name="roi_align_classifier")([rois, image_meta] + feature_maps)
# Two 1024 FC layers (implemented with Conv2D for consistency)
x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (pool_size, pool_size), padding="valid"),
name="mrcnn_class_conv1")(x)
x = KL.TimeDistributed(BatchNorm(), name=‘mrcnn_class_bn1‘)(x, training=train_bn)
x = KL.Activation(‘relu‘)(x)
x = KL.TimeDistributed(KL.Conv2D(fc_layers_size, (1, 1)),
name="mrcnn_class_conv2")(x)
x = KL.TimeDistributed(BatchNorm(), name=‘mrcnn_class_bn2‘)(x, training=train_bn)
x = KL.Activation(‘relu‘)(x)
shared = KL.Lambda(lambda x: K.squeeze(K.squeeze(x, 3), 2),
name="pool_squeeze")(x)
# Classifier head
mrcnn_class_logits = KL.TimeDistributed(KL.Dense(num_classes),
name=‘mrcnn_class_logits‘)(shared)
mrcnn_probs = KL.TimeDistributed(KL.Activation("softmax"),
name="mrcnn_class")(mrcnn_class_logits)
# BBox head
# [batch, num_rois, NUM_CLASSES * (dy, dx, log(dh), log(dw))]
x = KL.TimeDistributed(KL.Dense(num_classes * 4, activation=‘linear‘),
name=‘mrcnn_bbox_fc‘)(shared)
# Reshape to [batch, num_rois, NUM_CLASSES, (dy, dx, log(dh), log(dw))]
s = K.int_shape(x)
mrcnn_bbox = KL.Reshape((s[1], num_classes, 4), name="mrcnn_bbox")(x)
return mrcnn_class_logits, mrcnn_probs, mrcnn_bbox
def build_fpn_mask_graph(rois, feature_maps, image_meta, pool_size, num_classes, train_bn=True):
"""Builds the computation graph of the mask head of Feature Pyramid Network.
rois: [batch, num_rois, (y1, x1, y2, x2)] Proposal boxes in normalized coordinates.
feature_maps: List of feature maps from different layers of the pyramid,
[P2, P3, P4, P5]. Each has a different resolution.
image_meta: [batch, (meta data)] Image details. See compose_image_meta()
pool_size: The width of the square feature map generated from ROI Pooling.
num_classes: number of classes, which determines the depth of the results
train_bn: Boolean. Train or freeze Batch Norm layers
Returns: Masks [batch, num_rois, MASK_POOL_SIZE, MASK_POOL_SIZE, NUM_CLASSES]
"""
# ROI Pooling
# Shape: [batch, num_rois, MASK_POOL_SIZE, MASK_POOL_SIZE, channels]
x = PyramidROIAlign([pool_size, pool_size], name="roi_align_mask")([rois, image_meta] + feature_maps)
# Conv layers
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv1")(x)
x = KL.TimeDistributed(BatchNorm(),
name=‘mrcnn_mask_bn1‘)(x, training=train_bn)
x = KL.Activation(‘relu‘)(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv2")(x)
x = KL.TimeDistributed(BatchNorm(),
name=‘mrcnn_mask_bn2‘)(x, training=train_bn)
x = KL.Activation(‘relu‘)(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv3")(x)
x = KL.TimeDistributed(BatchNorm(),
name=‘mrcnn_mask_bn3‘)(x, training=train_bn)
x = KL.Activation(‘relu‘)(x)
x = KL.TimeDistributed(KL.Conv2D(256, (3, 3), padding="same"),
name="mrcnn_mask_conv4")(x)
x = KL.TimeDistributed(BatchNorm(),
name=‘mrcnn_mask_bn4‘)(x, training=train_bn)
x = KL.Activation(‘relu‘)(x)
x = KL.TimeDistributed(KL.Conv2DTranspose(256, (2, 2), strides=2, activation="relu"),
name="mrcnn_mask_deconv")(x)
x = KL.TimeDistributed(KL.Conv2D(num_classes, (1, 1), strides=1, activation="sigmoid"),
name="mrcnn_mask")(x)
return x
############################################################
# Loss Functions
############################################################
def smooth_l1_loss(y_true, y_pred):
"""
Implements Smooth-L1 loss. y_true and y_pred are typically: [N, 4], but could be any shape.
按照作者最正统的解释: 因为回归的targets没有明确的限制,因此可能会出现较大的错误的偏移去主导 loss 的情况,最终造成梯度爆炸,
使用 smooth L1 loss 能够更好地避免这种情况。
Smooth L1 Loss结合了L2 Loss收敛更快,且在0点有导数,便于收敛的好处。也在边界区域结合了L1 Loss的好处,让网络对异常值更加robust,
能够在偏移值较大时还能拉回来。
Smooth L1 Loss相比于L2 Loss对于离群点、异常值(outliers)更不敏感,或者说是更加鲁棒,可控制梯度的量级使训练时不容易跑飞。
(Fast R-CNN中的解释:L1 loss that is less sensitive to outliers than the L2 loss used in R-CNN and SPPnet)
原文链接:https://blog.csdn.net/ytusdc/article/details/86659696
"""
diff = K.abs(y_true - y_pred)
less_than_one = K.cast(K.less(diff, 1.0), "float32") # 逐个元素比对 (x < y) 的真值
loss = (less_than_one * 0.5 * diff**2) + (1 - less_than_one) * (diff - 0.5) # 当diff绝对值小于1时候less_than_one为1,否则为0,为计算此部分损失将用1-less_than_one
return loss
def rpn_class_loss_graph(rpn_match, rpn_class_logits):
"""RPN anchor classifier loss.
rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
-1=negative, 0=neutral anchor.
rpn_class_logits: [batch, anchors, 2]. RPN classifier logits for BG/FG.
"""
# Squeeze last dim to simplify
rpn_match = tf.squeeze(rpn_match, -1) # 去掉维度
# Get anchor classes. Convert the -1/+1 match to 0/1 values.
anchor_class = K.cast(K.equal(rpn_match, 1), tf.int32) # rpn 等于1的全为True,否则为False,在经过转换变成1或0
# Positive and Negative anchors contribute to the loss,
# but neutral anchors (match value = 0) don‘t.
indices = tf.where(K.not_equal(rpn_match, 0)) # [n,2] 将不为0的位置找到 因为中立不需要进行loss计算,因此不需要。所以这一步是剔除得到需要进行loss计算的索引
# Pick rows that contribute to the loss and filter out the rest.
rpn_class_logits = tf.gather_nd(rpn_class_logits, indices) # 取了对应的batch与anchor的2列
anchor_class = tf.gather_nd(anchor_class, indices) # 将anchor_class 转变成0,1类,0表示背景,1表示前景
# Cross entropy loss
loss = K.sparse_categorical_crossentropy(target=anchor_class,
output=rpn_class_logits,
from_logits=True)
loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0)) # 根据一个标量值在两个操作之间切换,损失函数不能为负号
return loss
def rpn_bbox_loss_graph(config, target_bbox, rpn_match, rpn_bbox):
"""Return the RPN bounding box loss graph.
config: the model config object.
target_bbox: [batch, max positive anchors, (dy, dx, log(dh), log(dw))].
Uses 0 padding to fill in unsed bbox deltas.
rpn_match: [batch, anchors, 1]. Anchor match type. 1=positive,
-1=negative, 0=neutral anchor.
rpn_bbox: [batch, anchors, (dy, dx, log(dh), log(dw))]
"""
# Positive anchors contribute to the loss, but negative and
# neutral anchors (match value of 0 or -1) don‘t.
rpn_match = K.squeeze(rpn_match, -1) # 删除最后一个维度将三维变成二维 [batch,anchors]
indices = tf.where(K.equal(rpn_match, 1)) # 挑选正样本序列
# Pick bbox deltas that contribute to the loss
rpn_bbox = tf.gather_nd(rpn_bbox, indices) # 挑选正样本的bbox
# Trim target bounding box deltas to the same length as rpn_bbox.
batch_counts = K.sum(K.cast(K.equal(rpn_match, 1), tf.int32), axis=1) # 计算出每个batch有多少个 [batch]
# target_bbox = batch_pack_graph(target_bbox, batch_counts, config.batch_size)
"""Picks different number of values from each row in x depending on the values in counts.
"""
target_bbox_temp=target_bbox
outputs = []
for i in range(config.batch_size):
outputs.append(target_bbox_temp[i, :batch_counts[i]])
target_bbox = tf.concat(outputs, axis=0)
loss = smooth_l1_loss(target_bbox, rpn_bbox)
loss = K.switch(tf.size(loss) > 0, K.mean(loss), tf.constant(0.0))
return loss
def mrcnn_class_loss_graph(target_class_ids, pred_class_logits,active_class_ids):
"""Loss for the classifier head of Mask RCNN.
target_class_ids: [batch, num_rois]. Integer class IDs. Uses zero
padding to fill in the array.
pred_class_logits: [batch, num_rois, num_classes]
active_class_ids: [batch, num_classes]. Has a value of 1 for
classes that are in the dataset of the image, and 0
for classes that are not in the dataset.
"""
# During model building, Keras calls this function with
# target_class_ids of type float32. Unclear why. Cast it
# to int to get around it.
target_class_ids = tf.cast(target_class_ids, ‘int64‘)
# Find predictions of classes that are not in the dataset.
pred_class_ids = tf.argmax(pred_class_logits, axis=2)
# TODO: Update this line to work with batch > 1. Right now it assumes all images in a batch have the same active_class_ids
pred_active = tf.gather(active_class_ids[0], pred_class_ids)
# Loss
loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target_class_ids, logits=pred_class_logits)
# Erase losses of predictions of classes that are not in the active
# classes of the image.
loss = loss * pred_active
# Computer loss mean. Use only predictions that contribute
# to the loss to get a correct mean.
loss = tf.reduce_sum(loss) / tf.reduce_sum(pred_active)
return loss
def mrcnn_bbox_loss_graph(target_bbox, target_class_ids, pred_bbox):
"""Loss for Mask R-CNN bounding box refinement.
target_bbox: [batch, num_rois, (dy, dx, log(dh), log(dw))]
target_class_ids: [batch, num_rois]. Integer class IDs.
pred_bbox: [batch, num_rois, num_classes, (dy, dx, log(dh), log(dw))]
"""
# Reshape to merge batch and roi dimensions for simplicity.
target_class_ids = K.reshape(target_class_ids, (-1,)) # 变成一个维度batch*num_rois
target_bbox = K.reshape(target_bbox, (-1, 4)) # [batch*num_rois,4]
pred_bbox = K.reshape(pred_bbox, (-1, K.int_shape(pred_bbox)[2], 4)) # [batch*num_rois,num_classes,4]
# Only positive ROIs contribute to the loss. And only
# the right class_id of each ROI. Get their indices.
positive_roi_ix = tf.where(target_class_ids > 0)[:, 0] # [[0],[1],[2],[3],[4]]将二维变成了[0 1 2 3 4]
positive_roi_class_ids = tf.cast(tf.gather(target_class_ids, positive_roi_ix), tf.int64) # 选择正样本的id序列,并提取出来
indices = tf.stack([positive_roi_ix, positive_roi_class_ids], axis=1) # 框的序列与class_id对应起来(全为正样本)
# Gather the deltas (predicted and true) that contribute to loss
target_bbox = tf.gather(target_bbox, positive_roi_ix) # 根据正样本序列,提取出
pred_bbox = tf.gather_nd(pred_bbox, indices)
# Smooth-L1 Loss
loss = K.switch(tf.size(target_bbox) > 0,smooth_l1_loss(y_true=target_bbox, y_pred=pred_bbox), tf.constant(0.0))
loss = K.mean(loss)
return loss
def mrcnn_mask_loss_graph(target_masks, target_class_ids, pred_masks):
"""Mask binary cross-entropy loss for the masks head.
target_masks: [batch, num_rois, height, width].
A float32 tensor of values 0 or 1. Uses zero padding to fill array.
target_class_ids: [batch, num_rois]. Integer class IDs. Zero padded.
pred_masks: [batch, proposals, height, width, num_classes] float32 tensor
with values from 0 to 1.
"""
# Reshape for simplicity. Merge first two dimensions into one.
target_class_ids = K.reshape(target_class_ids, (-1,))
mask_shape = tf.shape(target_masks)
target_masks = K.reshape(target_masks, (-1, mask_shape[2], mask_shape[3]))
pred_shape = tf.shape(pred_masks)
pred_masks = K.reshape(pred_masks,
(-1, pred_shape[2], pred_shape[3], pred_shape[4]))
# Permute predicted masks to [N, num_classes, height, width]
pred_masks = tf.transpose(pred_masks, [0, 3, 1, 2])
# Only positive ROIs contribute to the loss. And only
# the class specific mask of each ROI.
positive_ix = tf.where(target_class_ids > 0)[:, 0]
positive_class_ids = tf.cast(
tf.gather(target_class_ids, positive_ix), tf.int64)
indices = tf.stack([positive_ix, positive_class_ids], axis=1)
# Gather the masks (predicted and true) that contribute to loss
y_true = tf.gather(target_masks, positive_ix)
y_pred = tf.gather_nd(pred_masks, indices)
# Compute binary cross entropy. If no positive ROIs, then return 0.
# shape: [batch, roi, num_classes]
loss = K.switch(tf.size(y_true) > 0,
K.binary_crossentropy(target=y_true, output=y_pred),
tf.constant(0.0))
loss = K.mean(loss)
return loss
############################################################
# Data Generator
############################################################
def resize(image, output_shape, order=1, mode=‘constant‘, cval=0, clip=True,
preserve_range=False, anti_aliasing=False, anti_aliasing_sigma=None):
"""A wrapper for Scikit-Image resize().
Scikit-Image generates warnings on every call to resize() if it doesn‘t
receive the right parameters. The right parameters depend on the version
of skimage. This solves the problem by using different parameters per
version. And it provides a central place to control resizing defaults.
"""
if LooseVersion(skimage.__version__) >= LooseVersion("0.14"):
# New in 0.14: anti_aliasing. Default it to False for backward
# compatibility with skimage 0.13.
return skimage.transform.resize(
image, output_shape,
order=order, mode=mode, cval=cval, clip=clip,
preserve_range=preserve_range, anti_aliasing=anti_aliasing,
anti_aliasing_sigma=anti_aliasing_sigma)
else:
return skimage.transform.resize(
image, output_shape,
order=order, mode=mode, cval=cval, clip=clip,
preserve_range=preserve_range)
def resize_image(image, min_dim=None, max_dim=None, min_scale=None, mode="square"):
"""Resizes an image keeping the aspect ratio unchanged.
min_dim: if provided, resizes the image such that it‘s smaller dimension == min_dim
max_dim: if provided, ensures that the image longest side doesn‘t exceed this value.
min_scale: if provided, ensure that the image is scaled up by at least
this percent even if min_dim doesn‘t require it.
mode: Resizing mode.
none: No resizing. Return the image unchanged.
square: Resize and pad with zeros to get a square image of size [max_dim, max_dim].
pad64: Pads width and height with zeros to make them multiples of 64.
If min_dim or min_scale are provided, it scales the image up
before padding. max_dim is ignored in this mode.
The multiple of 64 is needed to ensure smooth scaling of feature
maps up and down the 6 levels of the FPN pyramid (2**6=64).
crop: Picks random crops from the image. First, scales the image based
on min_dim and min_scale, then picks a random crop of
size min_dim x min_dim. Can be used in training only.
max_dim is not used in this mode.
Returns:
image: the resized image
window: (y1, x1, y2, x2). If max_dim is provided, padding might
be inserted in the returned image. If so, this window is the
coordinates of the image part of the full image (excluding
the padding). The x2, y2 pixels are not included.
scale: The scale factor used to resize the image
padding: Padding added to the image [(top, bottom), (left, right), (0, 0)]
"""
# Keep track of image dtype and return results in the same dtype
image_dtype = image.dtype
# Default window (y1, x1, y2, x2) and default scale == 1.
h, w = image.shape[:2]
window = (0, 0, h, w)
scale = 1
padding = [(0, 0), (0, 0), (0, 0)]
if mode == "none":
return image, window, scale, padding
# Scale?
if min_dim:
# Scale up but not down
scale = max(1, min_dim / min(h, w)) # h, w是原始图片的高与宽
if min_scale and scale < min_scale: # min_scale是最小填充倍数的,至少要大于它
scale = min_scale
# Does it exceed max dim?
if max_dim and mode == "square":
image_max = max(h, w)
if round(image_max * scale) > max_dim: # 最终原图片最大边扩充不能超过最大max_dim维度,否则重新选择scale
scale = max_dim / image_max
# Resize image using bilinear interpolation
if scale != 1:
image = resize(image, (round(h * scale), round(w * scale)), preserve_range=True)
# 上一行代码对图像做了resize,那么会改变图像的尺寸,这是我不愿意看到的,我觉的这样会对缺陷特征有损失,
# 或者出现变异,因此小心这里的变化
# Need padding or cropping?
if mode == "square":
# Get new height and width
h, w = image.shape[:2] # 此时已经将原图按照scale进行了改变
top_pad = (max_dim - h) // 2
bottom_pad = max_dim - h - top_pad
left_pad = (max_dim - w) // 2
right_pad = max_dim - w - left_pad
padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
image = np.pad(image, padding, mode=‘constant‘, constant_values=0) # 将改变的图片进行了填充
window = (top_pad, left_pad, h + top_pad, w + left_pad) # 保存经过resize后图片的真实大小
elif mode == "pad64":
h, w = image.shape[:2]
# Both sides must be divisible by 64
assert min_dim % 64 == 0, "Minimum dimension must be a multiple of 64"
# Height
if h % 64 > 0:
max_h = h - (h % 64) + 64
top_pad = (max_h - h) // 2
bottom_pad = max_h - h - top_pad
else:
top_pad = bottom_pad = 0
# Width
if w % 64 > 0:
max_w = w - (w % 64) + 64
left_pad = (max_w - w) // 2
right_pad = max_w - w - left_pad
else:
left_pad = right_pad = 0
padding = [(top_pad, bottom_pad), (left_pad, right_pad), (0, 0)]
image = np.pad(image, padding, mode=‘constant‘, constant_values=0)
window = (top_pad, left_pad, h + top_pad, w + left_pad)
else:
raise Exception("Mode {} not supported".format(mode))
return image.astype(image_dtype), window, scale, padding
def resize_mask(mask, scale, padding):
# scale是输入图像的尺寸变化,padding是最大维度的背景填充,mask有效坐标对应原来输入的图像中
"""Resizes a mask using the given scale and padding.
Typically, you get the scale and padding from resize_image() to
ensure both, the image and the mask, are resized consistently.
scale: mask scaling factor
padding: Padding to add to the mask in the form
[(top, bottom), (left, right), (0, 0)]
"""
# Suppress warning from scipy 0.13.0, the output shape of zoom() is
# calculated with round() instead of int()
# with warnings.catch_warnings():
# warnings.simplefilter("ignore")
mask = scipy.ndimage.zoom(mask, zoom=[scale, scale, 1], order=0)
# if crop is not None:
# y, x, h, w = crop
# mask = mask[y:y + h, x:x + w]
# else:
mask = np.pad(mask, padding, mode=‘constant‘, constant_values=0)
return mask
def extract_bboxes(mask): # [[num_instances, (y1, x1, y2, x2)]]
# in a word,bbox proced by mask will contain all mask which value equal 1.
"""Compute bounding boxes from masks.
mask: [height, width, num_instances]. Mask pixels are either 1 or 0.
Returns: bbox array [num_instances, (y1, x1, y2, x2)].
"""
boxes = np.zeros([mask.shape[-1], 4], dtype=np.int32)
# the last dimension for mask (num_instances) is bbox for instance every picture
for i in range(mask.shape[-1]):
m = mask[:, :, i]
# Bounding box.
horizontal_indicies = np.where(np.any(m, axis=0))[0]
vertical_indicies = np.where(np.any(m, axis=1))[0]
if horizontal_indicies.shape[0]:
x1, x2 = horizontal_indicies[[0, -1]]
y1, y2 = vertical_indicies[[0, -1]]
# x2 and y2 should not be part of the box. Increment by 1.
x2 += 1
y2 += 1
else:
# No mask for this instance. Might happen due to
# resizing or cropping. Set bbox to zeros
x1, x2, y1, y2 = 0, 0, 0, 0
boxes[i] = np.array([y1, x1, y2, x2])
return boxes.astype(np.int32)
def box_refinement(box, gt_box):
"""Compute refinement needed to transform box to gt_box.
box and gt_box are [N, (y1, x1, y2, x2)]. (y2, x2) is
assumed to be outside the box.
"""
box = box.astype(np.float32)
gt_box = gt_box.astype(np.float32)
height = box[:, 2] - box[:, 0]
width = box[:, 3] - box[:, 1]
center_y = box[:, 0] + 0.5 * height
center_x = box[:, 1] + 0.5 * width
gt_height = gt_box[:, 2] - gt_box[:, 0]
gt_width = gt_box[:, 3] - gt_box[:, 1]
gt_center_y = gt_box[:, 0] + 0.5 * gt_height
gt_center_x = gt_box[:, 1] + 0.5 * gt_width
dy = (gt_center_y - center_y) / height
dx = (gt_center_x - center_x) / width
dh = np.log(gt_height / height)
dw = np.log(gt_width / width)
return np.stack([dy, dx, dh, dw], axis=1)
def build_detection_targets(rpn_rois, gt_class_ids, gt_boxes, gt_masks, config):
"""Generate targets for training Stage 2 classifier and mask heads.
This is not used in normal training. It‘s useful for debugging or to train
the Mask RCNN heads without using the RPN head.
Inputs:
rpn_rois: [N, (y1, x1, y2, x2)] proposal boxes.
gt_class_ids: [instance count] Integer class IDs
gt_boxes: [instance count, (y1, x1, y2, x2)]
gt_masks: [height, width, instance count] Ground truth masks. Can be full
size or mini-masks.
Returns:
rois: [TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)]
class_ids: [TRAIN_ROIS_PER_IMAGE]. Integer class IDs.
bboxes: [TRAIN_ROIS_PER_IMAGE, NUM_CLASSES, (y, x, log(h), log(w))]. Class-specific
bbox refinements.
masks: [TRAIN_ROIS_PER_IMAGE, height, width, NUM_CLASSES). Class specific masks cropped
to bbox boundaries and resized to neural network output size.
"""
assert rpn_rois.shape[0] > 0
assert gt_class_ids.dtype == np.int32, "Expected int but got {}".format(
gt_class_ids.dtype)
assert gt_boxes.dtype == np.int32, "Expected int but got {}".format(
gt_boxes.dtype)
assert gt_masks.dtype == np.bool_, "Expected bool but got {}".format(
gt_masks.dtype)
# It‘s common to add GT Boxes to ROIs but we don‘t do that here because
# according to XinLei Chen‘s paper, it doesn‘t help.
# Trim empty padding in gt_boxes and gt_masks parts
instance_ids = np.where(gt_class_ids > 0)[0]
assert instance_ids.shape[0] > 0, "Image must contain instances."
gt_class_ids = gt_class_ids[instance_ids]
gt_boxes = gt_boxes[instance_ids]
gt_masks = gt_masks[:, :, instance_ids]
# Compute areas of ROIs and ground truth boxes.
rpn_roi_area = (rpn_rois[:, 2] - rpn_rois[:, 0]) *
(rpn_rois[:, 3] - rpn_rois[:, 1])
gt_box_area = (gt_boxes[:, 2] - gt_boxes[:, 0]) *
(gt_boxes[:, 3] - gt_boxes[:, 1])
# Compute overlaps [rpn_rois, gt_boxes]
overlaps = np.zeros((rpn_rois.shape[0], gt_boxes.shape[0]))
for i in range(overlaps.shape[1]):
gt = gt_boxes[i]
overlaps[:, i] = compute_iou(
gt, rpn_rois, gt_box_area[i], rpn_roi_area)
# Assign ROIs to GT boxes
rpn_roi_iou_argmax = np.argmax(overlaps, axis=1)
rpn_roi_iou_max = overlaps[np.arange(
overlaps.shape[0]), rpn_roi_iou_argmax]
# GT box assigned to each ROI
rpn_roi_gt_boxes = gt_boxes[rpn_roi_iou_argmax]
rpn_roi_gt_class_ids = gt_class_ids[rpn_roi_iou_argmax]
# Positive ROIs are those with >= 0.5 IoU with a GT box.
fg_ids = np.where(rpn_roi_iou_max > 0.5)[0]
# Negative ROIs are those with max IoU 0.1-0.5 (hard example mining)
# TODO: To hard example mine or not to hard example mine, that‘s the question
# bg_ids = np.where((rpn_roi_iou_max >= 0.1) & (rpn_roi_iou_max < 0.5))[0]
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
# Subsample ROIs. Aim for 33% foreground.
# FG
fg_roi_count = int(config.TRAIN_ROIS_PER_IMAGE * config.ROI_POSITIVE_RATIO)
if fg_ids.shape[0] > fg_roi_count:
keep_fg_ids = np.random.choice(fg_ids, fg_roi_count, replace=False)
else:
keep_fg_ids = fg_ids
# BG
remaining = config.TRAIN_ROIS_PER_IMAGE - keep_fg_ids.shape[0]
if bg_ids.shape[0] > remaining:
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
else:
keep_bg_ids = bg_ids
# Combine indices of ROIs to keep
keep = np.concatenate([keep_fg_ids, keep_bg_ids])
# Need more?
remaining = config.TRAIN_ROIS_PER_IMAGE - keep.shape[0]
if remaining > 0:
# Looks like we don‘t have enough samples to maintain the desired
# balance. Reduce requirements and fill in the rest. This is
# likely different from the Mask RCNN paper.
# There is a small chance we have neither fg nor bg samples.
if keep.shape[0] == 0:
# Pick bg regions with easier IoU threshold
bg_ids = np.where(rpn_roi_iou_max < 0.5)[0]
assert bg_ids.shape[0] >= remaining
keep_bg_ids = np.random.choice(bg_ids, remaining, replace=False)
assert keep_bg_ids.shape[0] == remaining
keep = np.concatenate([keep, keep_bg_ids])
else:
# Fill the rest with repeated bg rois.
keep_extra_ids = np.random.choice(
keep_bg_ids, remaining, replace=True)
keep = np.concatenate([keep, keep_extra_ids])
assert keep.shape[0] == config.TRAIN_ROIS_PER_IMAGE,
"keep doesn‘t match ROI batch size {}, {}".format(
keep.shape[0], config.TRAIN_ROIS_PER_IMAGE)
# Reset the gt boxes assigned to BG ROIs.
rpn_roi_gt_boxes[keep_bg_ids, :] = 0
rpn_roi_gt_class_ids[keep_bg_ids] = 0
# For each kept ROI, assign a class_id, and for FG ROIs also add bbox refinement.
rois = rpn_rois[keep]
roi_gt_boxes = rpn_roi_gt_boxes[keep]
roi_gt_class_ids = rpn_roi_gt_class_ids[keep]
roi_gt_assignment = rpn_roi_iou_argmax[keep]
# Class-aware bbox deltas. [y, x, log(h), log(w)]
bboxes = np.zeros((config.TRAIN_ROIS_PER_IMAGE,
config.NUM_CLASSES, 4), dtype=np.float32)
pos_ids = np.where(roi_gt_class_ids > 0)[0]
bboxes[pos_ids, roi_gt_class_ids[pos_ids]] = box_refinement(
rois[pos_ids], roi_gt_boxes[pos_ids, :4])
# Normalize bbox refinements
bboxes /= config.BBOX_STD_DEV
# Generate class-specific target masks
masks = np.zeros((config.TRAIN_ROIS_PER_IMAGE, config.MASK_SHAPE[0], config.MASK_SHAPE[1], config.NUM_CLASSES),
dtype=np.float32)
for i in pos_ids:
class_id = roi_gt_class_ids[i]
assert class_id > 0, "class id must be greater than 0"
gt_id = roi_gt_assignment[i]
class_mask = gt_masks[:, :, gt_id]
# Pick part of the mask and resize it
y1, x1, y2, x2 = rois[i].astype(np.int32)
m = class_mask[y1:y2, x1:x2]
mask = resize(m, config.MASK_SHAPE)
masks[i, :, :, class_id] = mask
return rois, roi_gt_class_ids, bboxes, masks
def compute_overlaps(boxes1, boxes2):
# each value in boxes2 compute with all boxes1,and calling compute_iou function
# finally, value save in [number_boxes1,number_boxes2]
"""Computes IoU overlaps between two sets of boxes.
boxes1, boxes2: [N, (y1, x1, y2, x2)].
For better performance, pass the largest set first and the smaller second.
"""
# Areas of anchors and GT boxes
area1 = (boxes1[:, 2] - boxes1[:, 0]) * (boxes1[:, 3] - boxes1[:, 1])
area2 = (boxes2[:, 2] - boxes2[:, 0]) * (boxes2[:, 3] - boxes2[:, 1])
# Compute overlaps to generate matrix [boxes1 count, boxes2 count]
# Each cell contains the IoU value.
overlaps = np.zeros((boxes1.shape[0], boxes2.shape[0])) # building variables for overlaps to save
for i in range(overlaps.shape[1]):
box2 = boxes2[i]
y1 = np.maximum(box2[0], boxes1[:, 0])
y2 = np.minimum(box2[2], boxes1[:, 2])
x1 = np.maximum(box2[1], boxes1[:, 1])
x2 = np.minimum(box2[3], boxes1[:, 3])
intersection = np.maximum(x2 - x1, 0) * np.maximum(y2 - y1, 0)
union = area2[i] + area1[:] - intersection[:]
overlaps[:, i] = intersection / union
return overlaps
def build_rpn_targets(anchors, gt_class_ids, gt_boxes, config):
print(‘mode_data_rpn_box‘)
"""Given the anchors and GT boxes, compute overlaps and identify positive
anchors and deltas to refine them to match their corresponding GT boxes.
anchors: [num_anchors, (y1, x1, y2, x2)]
gt_class_ids: [num_gt_boxes] Integer class IDs.
gt_boxes: [num_gt_boxes, (y1, x1, y2, x2)]
Returns:
rpn_match: [N] (int32) matches between anchors and GT boxes.
1 = positive anchor, -1 = negative anchor, 0 = neutral
rpn_bbox: [N, (dy, dx, log(dh), log(dw))] Anchor bbox deltas.
"""
# RPN Match: 1 = positive anchor, -1 = negative anchor, 0 = neutral
rpn_match = np.zeros([anchors.shape[0]], dtype=np.int32)
# RPN bounding boxes: [max anchors per image, (dy, dx, log(dh), log(dw))]
rpn_bbox = np.zeros((config.RPN_TRAIN_ANCHORS_PER_IMAGE, 4))
# Handle COCO crowds
# A crowd box in COCO is a bounding box around several instances. Exclude
# them from training. A crowd box is given a negative class ID.
crowd_ix = np.where(gt_class_ids < 0)[0]
if crowd_ix.shape[0] > 0:
# Filter out crowds from ground truth class IDs and boxes
non_crowd_ix = np.where(gt_class_ids > 0)[0]
crowd_boxes = gt_boxes[crowd_ix]
gt_class_ids = gt_class_ids[non_crowd_ix]
gt_boxes = gt_boxes[non_crowd_ix]
# Compute overlaps with crowd boxes [anchors, crowds]
crowd_overlaps = compute_overlaps(anchors, crowd_boxes)
crowd_iou_max = np.amax(crowd_overlaps, axis=1)
no_crowd_bool = (crowd_iou_max < 0.001)
else:
# All anchors don‘t intersect a crowd
no_crowd_bool = np.ones([anchors.shape[0]], dtype=bool)
# Compute overlaps [num_anchors, num_gt_boxes]
overlaps = compute_overlaps(anchors, gt_boxes)
# Match anchors to GT Boxes
# If an anchor overlaps a GT box with IoU >= 0.7 then it‘s positive.
# If an anchor overlaps a GT box with IoU < 0.3 then it‘s negative.
# Neutral anchors are those that don‘t match the conditions above,
# and they don‘t influence the loss function.
# However, don‘t keep any GT box unmatched (rare, but happens). Instead,
# match it to the closest anchor (even if its max IoU is < 0.3).
#
# 1. Set negative anchors first. They get overwritten below if a GT box is
# matched to them. Skip boxes in crowd areas.
anchor_iou_argmax = np.argmax(overlaps, axis=1)
anchor_iou_max = overlaps[np.arange(overlaps.shape[0]), anchor_iou_argmax]
rpn_match[(anchor_iou_max < 0.3) & (no_crowd_bool)] = -1
# 2. Set an anchor for each GT box (regardless of IoU value).
# If multiple anchors have the same IoU match all of them
gt_iou_argmax = np.argwhere(overlaps == np.max(overlaps, axis=0))[:,0]
rpn_match[gt_iou_argmax] = 1
# 3. Set anchors with high overlap as positive.
rpn_match[anchor_iou_max >= 0.7] = 1
# Subsample to balance positive and negative anchors
# Don‘t let positives be more than half the anchors
ids = np.where(rpn_match == 1)[0]
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE // 2)
if extra > 0:
# Reset the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0
# Same for negative proposals
ids = np.where(rpn_match == -1)[0]
extra = len(ids) - (config.RPN_TRAIN_ANCHORS_PER_IMAGE -
np.sum(rpn_match == 1))
if extra > 0:
# Rest the extra ones to neutral
ids = np.random.choice(ids, extra, replace=False)
rpn_match[ids] = 0
# For positive anchors, compute shift and scale needed to transform them
# to match the corresponding GT boxes.
ids = np.where(rpn_match == 1)[0]
ix = 0 # index into rpn_bbox
# TODO: use box_refinement() rather than duplicating the code here
for i, a in zip(ids, anchors[ids]):
# Closest gt box (it might have IoU < 0.7)
gt = gt_boxes[anchor_iou_argmax[i]]
# Convert coordinates to center plus width/height.
# GT Box
gt_h = gt[2] - gt[0]
gt_w = gt[3] - gt[1]
gt_center_y = gt[0] + 0.5 * gt_h
gt_center_x = gt[1] + 0.5 * gt_w
# Anchor
a_h = a[2] - a[0]
a_w = a[3] - a[1]
a_center_y = a[0] + 0.5 * a_h
a_center_x = a[1] + 0.5 * a_w
# Compute the bbox refinement that the RPN should predict.
rpn_bbox[ix] = [
(gt_center_y - a_center_y) / a_h,
(gt_center_x - a_center_x) / a_w,
np.log(gt_h / a_h),
np.log(gt_w / a_w),
]
# Normalize
rpn_bbox[ix] /= config.RPN_BBOX_STD_DEV
ix += 1
return rpn_match, rpn_bbox
def generate_random_rois(image_shape, count, gt_boxes):
"""Generates ROI proposals similar to what a region proposal network
would generate.
image_shape: [Height, Width, Depth]
count: Number of ROIs to generate
gt_class_ids: [N] Integer ground truth class IDs
gt_boxes: [N, (y1, x1, y2, x2)] Ground truth boxes in pixels.
Returns: [count, (y1, x1, y2, x2)] ROI boxes in pixels.
"""
# placeholder
rois = np.zeros((count, 4), dtype=np.int32)
# Generate random ROIs around GT boxes (90% of count)
rois_per_box = int(0.9 * count / gt_boxes.shape[0])
for i in range(gt_boxes.shape[0]):
gt_y1, gt_x1, gt_y2, gt_x2 = gt_boxes[i]
h = gt_y2 - gt_y1
w = gt_x2 - gt_x1
# random boundaries
r_y1 = max(gt_y1 - h, 0)
r_y2 = min(gt_y2 + h, image_shape[0])
r_x1 = max(gt_x1 - w, 0)
r_x2 = min(gt_x2 + w, image_shape[1])
# To avoid generating boxes with zero area, we generate double what
# we need and filter out the extra. If we get fewer valid boxes
# than we need, we loop and try again.
while True:
y1y2 = np.random.randint(r_y1, r_y2, (rois_per_box * 2, 2))
x1x2 = np.random.randint(r_x1, r_x2, (rois_per_box * 2, 2))
# Filter out zero area boxes
threshold = 1
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
threshold][:rois_per_box]
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
threshold][:rois_per_box]
if y1y2.shape[0] == rois_per_box and x1x2.shape[0] == rois_per_box:
break
# Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
# into x1, y1, x2, y2 order
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
box_rois = np.hstack([y1, x1, y2, x2])
rois[rois_per_box * i:rois_per_box * (i + 1)] = box_rois
# Generate random ROIs anywhere in the image (10% of count)
remaining_count = count - (rois_per_box * gt_boxes.shape[0])
# To avoid generating boxes with zero area, we generate double what
# we need and filter out the extra. If we get fewer valid boxes
# than we need, we loop and try again.
while True:
y1y2 = np.random.randint(0, image_shape[0], (remaining_count * 2, 2))
x1x2 = np.random.randint(0, image_shape[1], (remaining_count * 2, 2))
# Filter out zero area boxes
threshold = 1
y1y2 = y1y2[np.abs(y1y2[:, 0] - y1y2[:, 1]) >=
threshold][:remaining_count]
x1x2 = x1x2[np.abs(x1x2[:, 0] - x1x2[:, 1]) >=
threshold][:remaining_count]
if y1y2.shape[0] == remaining_count and x1x2.shape[0] == remaining_count:
break
# Sort on axis 1 to ensure x1 <= x2 and y1 <= y2 and then reshape
# into x1, y1, x2, y2 order
x1, x2 = np.split(np.sort(x1x2, axis=1), 2, axis=1)
y1, y2 = np.split(np.sort(y1y2, axis=1), 2, axis=1)
global_rois = np.hstack([y1, x1, y2, x2])
rois[-remaining_count:] = global_rois
return rois
def generate_pyramid_anchors(scales, ratios, feature_shapes, feature_strides,anchor_stride):
"""Generate anchors at different levels of a feature pyramid. Each scale
is associated with a level of the pyramid, but each ratio is used in
all levels of the pyramid.
Returns:
anchors: [N, (y1, x1, y2, x2)]. All generated anchors in one array. Sorted
with the same order of the given scales. So, anchors of scale[0] come
first, then anchors of scale[1], and so on.
"""
# Anchors
# [anchor_count, (y1, x1, y2, x2)]
anchors = []
for i in range(len(scales)):
# anchors.append(generate_anchors(scales[i], ratios, feature_shapes[i], feature_strides[i], anchor_stride))
"""
scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
shape: [height, width] spatial shape of the feature map over which to generate anchors.
feature_stride: Stride of the feature map relative to the image in pixels.
anchor_stride: Stride of anchors on the feature map. For example, if the value is 2 then generate anchors for every other feature map pixel.
"""
# Get all combinations of scales and ratios
scale, ratios = np.meshgrid(np.array(scales[i]), np.array(ratios))
scale = scale.flatten()
ratios = ratios.flatten()
shape=feature_shapes[i]
feature_stride=feature_strides[i]
# Enumerate heights and widths from scales and ratios
# 实际得到box的宽与高
heights = scale / np.sqrt(ratios)
widths = scale * np.sqrt(ratios)
# Enumerate shifts in feature space
# 实际得到box坐标中心
shifts_y = np.arange(0, shape[0],
anchor_stride) * feature_stride # anchor_stride 表示原图img/stride缩放后以anchor_stride为步长取像素,
# 一此作为中心点,而后乘以feature_stride(stride)将像素中心放回原图像位置中。
shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
# Enumerate combinations of shifts, widths, and heights
box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
# Reshape to get a list of (y, x) and a list of (h, w)
box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
# code above make center of bboxes and height width of bboxes
# Convert to corner coordinates (y1, x1, y2, x2)
boxes = np.concatenate([box_centers - 0.5 * box_sizes, box_centers + 0.5 * box_sizes], axis=1)
# convert center height and width coordinate of bbox to four coordinates which respectively represnt top left corner and lower right corner
anchors.append(boxes)
return np.concatenate(anchors, axis=0)
############################################################
# MaskRCNN Class
############################################################
class MaskRCNN( ):
"""Encapsulates the Mask RCNN model functionality.
The actual Keras model is in the keras_model property.
"""
# model_dir=D:MASKRCNNmask-rcnn-meMASKRCNN_myselfMask_RCNN-masterlogs
def __init__(self, mode, config):
"""
mode: Either "training" or "inference"
config: A Sub-class of the Config class
"""
assert mode in [‘training‘, ‘inference‘]
self.mode = mode
self.config = config
self.keras_model = self.build(mode=mode, config=config)
def build(self, mode, config):
"""Build Mask R-CNN architecture.
input_shape: The shape of the input image.
mode: Either "training" or "inference". The inputs and
outputs of the model differ accordingly.
"""
assert mode in [‘training‘, ‘inference‘]
# Image size must be dividable by 2 multiple times
h, w = config.IMAGE_SHAPE[:2] # 800 or 1024 have provided inherent numbers
if h / 2**6 != int(h / 2**6) or w / 2**6 != int(w / 2**6): # 这里就限定了下采样不会产生坐标误差
raise Exception("Image size must be dividable by 2 at least 6 times "
"to avoid fractions when downscaling and upscaling."
"For example, use 256, 320, 384, 448, 512, ... etc. ")
# Inputs
input_image = KL.Input(shape=[None, None, config.IMAGE_SHAPE[2]], name="input_image")
input_image_meta = KL.Input(shape=[config.IMAGE_META_SIZE], name="input_image_meta")
# 实际给出输入的变量是多了一个batch的维度
if mode == "training":
# RPN GT
input_rpn_match = KL.Input(shape=[None, 1], name="input_rpn_match", dtype=tf.int32)
input_rpn_bbox = KL.Input(shape=[None, 4], name="input_rpn_bbox", dtype=tf.float32)
# RPN_TRAIN_ANCHORS_PER_IMAGE = 256
# Detection GT (class IDs, bounding boxes, and masks)
# 1. GT Class IDs (zero padded)
input_gt_class_ids = KL.Input(shape=[None], name="input_gt_class_ids", dtype=tf.int32)
# 2. GT Boxes in pixels (zero padded)
# [batch, MAX_GT_INSTANCES, (y1, x1, y2, x2)] in image coordinates
input_gt_boxes = KL.Input(shape=[None, 4], name="input_gt_boxes", dtype=tf.float32)
# Normalize coordinates
gt_boxes = KL.Lambda(lambda x: norm_boxes_graph(x, K.shape(input_image)[1:3]))(input_gt_boxes)
# 3. GT Masks (zero padded)
# [batch, height, width, MAX_GT_INSTANCES] MAX_GT_INSTANCES=100
# if config.USE_MINI_MASK: # USE_MINI_MASK=true
# input_gt_masks = KL.Input(shape=[config.MINI_MASK_SHAPE[0],config.MINI_MASK_SHAPE[1], None], name="input_gt_masks", dtype=bool) # MINI_MASK_SHAPE = (56, 56)
# else:
input_gt_masks = KL.Input(shape=[config.IMAGE_SHAPE[0], config.IMAGE_SHAPE[1], None], name= "input_gt_masks", dtype=bool) # 1024 or 512
elif mode == "inference":
# Anchors in normalized coordinates
input_anchors = KL.Input(shape=[None, 4], name="input_anchors")
# Build the shared convolutional layers.
# Bottom-up Layers
# Returns a list of the last layers of each stage, 5 in total.
# Don‘t create the thead (stage 5), so we pick the 4th item in the list.
# if callable(config.BACKBONE): # 检查一个函数是否可被调用 BACKBONE = "resnet101"
# _, C2, C3, C4, C5 = config.BACKBONE(input_image, stage5=True, train_bn=config.TRAIN_BN)
# # 上一行的代码应该是调用训练好的网络结构吧
# callable()方法用来检测对象是否可被调用,可被调用指的是对象能否使用()括号的方法调用
# else:
_, C2, C3, C4, C5 = resnet_graph(input_image, config.BACKBONE, stage5=True, train_bn=config.TRAIN_BN)
# Top-down Layers
# TODO: add assert to varify feature map sizes match what‘s in config
P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name=‘fpn_c5p5‘)(C5) # config.TOP_DOWN_PYRAMID_SIZE=256
P4 = KL.Add(name="fpn_p4add")([KL.UpSampling2D(size=(2, 2), name="fpn_p5upsampled")(P5),KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name=‘fpn_c4p4‘)(C4)]) # channel of the end bring into correspondence with other channel
P3 = KL.Add(name="fpn_p3add")([KL.UpSampling2D(size=(2, 2), name="fpn_p4upsampled")(P4), KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name=‘fpn_c3p3‘)(C3)])
P2 = KL.Add(name="fpn_p2add")([KL.UpSampling2D(size=(2, 2), name="fpn_p3upsampled")(P3), KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (1, 1), name=‘fpn_c2p2‘)(C2)])
# Attach 3x3 conv to all P layers to get the final feature maps.
P2 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p2")(P2)
P3 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p3")(P3)
P4 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p4")(P4)
P5 = KL.Conv2D(config.TOP_DOWN_PYRAMID_SIZE, (3, 3), padding="SAME", name="fpn_p5")(P5) # TOP_DOWN_PYRAMID_SIZE = 256
# P6 is used for the 5th anchor scale in RPN. Generated by
# subsampling from P5 with stride of 2.
P6 = KL.MaxPooling2D(pool_size=(1, 1), strides=2, name="fpn_p6")(P5)
# Note that P6 is used in RPN, but not in the classifier heads.
rpn_feature_maps = [P2, P3, P4, P5, P6]
mrcnn_feature_maps = [P2, P3, P4, P5]
# Anchors
if mode == "training":
anchors = self.get_anchors(config.IMAGE_SHAPE)
# Duplicate across the batch dimension because Keras requires it
# TODO: can this be optimized to avoid duplicating the anchors?
anchors = np.broadcast_to(anchors, (config.batch_size,) + anchors.shape) # 将每一张图得到的boxes变成batch
# A hack to get around Keras‘s bad support for constants
anchors = KL.Lambda(lambda x: tf.Variable(anchors), name="anchors")(input_image) # ks接受tf的变量
else:
anchors = input_anchors
# RPN Model, 返回的是keras的Module对象, 注意keras中的Module对象是可call的
# RPN Model
rpn = build_rpn_model(config.RPN_ANCHOR_STRIDE, len(config.RPN_ANCHOR_RATIOS), config.TOP_DOWN_PYRAMID_SIZE)
# H * W * anchors_per_location 每一层都会有这么多框,而H=img/4(stride)
# config.TOP_DOWN_PYRAMID_SIZE=256 RPN_ANCHOR_RATIOS = [0.5, 1, 2] RPN_ANCHOR_STRIDE=1
# Loop through pyramid layers
layer_outputs = [] # list of lists # 保存各pyramid特征经过RPN之后的结果
for p in rpn_feature_maps: # rpn_feature_maps = [P2, P3, P4, P5, P6]
layer_outputs.append(rpn([p]))
# Concatenate layer outputs
# Convert from list of lists of level outputs to list of lists
# of outputs across levels.
# e.g. [[a1, b1, c1], [a2, b2, c2]] => [[a1, a2], [b1, b2], [c1, c2]]
output_names = ["rpn_class_logits", "rpn_class", "rpn_bbox"] # "rpn_class_logits"为分类 "rpn_class"为置信度 "rpn_bbox"为box
outputs = list(zip(*layer_outputs)) # [[logits2,……6], [class2,……6], [bbox2,……6]]
outputs = [KL.Concatenate(axis=1, name=n)(list(o)) for o, n in zip(outputs, output_names)] #
# [batch, num_anchors, 2/4]
# 其中num_anchors指的是全部特征层上的anchors总数
rpn_class_logits, rpn_class, rpn_bbox = outputs # "rpn_class_logits"为分类 "rpn_class"为置信度 "rpn_bbox"为box
# Generate proposals
# Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
# and zero padded.
# Generate proposals
# Proposals are [batch, N, (y1, x1, y2, x2)] in normalized coordinates
# and zero padded.
# POST_NMS_ROIS_INFERENCE = 1000
# POST_NMS_ROIS_TRAINING = 2000
proposal_count = config.POST_NMS_ROIS_TRAINING if mode == "training"
else config.POST_NMS_ROIS_INFERENCE
‘‘‘
上一步我们获取了全部锚框的信息,这里我们的目的是从中挑选指定个数的更可能包含obj的锚框作为建议区域,
即我们希望获取在上一步的二分类中前景得分更高的框,同时,由于锚框生成算法的设计,
其数量巨大且重叠严重,我们在得分高低的基础上,进一步的希望能够去重(非极大值抑制),
这就是proposal生成的目的。proposal_count是一个整数,用于指定生成proposal数目,
不足时会生成坐标为[0,0,0,0]的空值进行补全。
这里的变量scores = inputs[0][:, :, 1],即我们只需要全部候选框的前景得分
‘‘‘
rpn_rois = ProposalLayer(
proposal_count=proposal_count, # proposal_count=2000 for train proposal_count=1000 for inference
nms_threshold=config.RPN_NMS_THRESHOLD, # 0.7 小于该阈值被保留
name="ROI",
config=config)([rpn_class, rpn_bbox, anchors])
if mode == "training":
# Class ID mask to mark class IDs supported by the dataset the image
# came from.
active_class_ids = KL.Lambda(lambda x: parse_image_meta_graph(x)["active_class_ids"])(input_image_meta)
if not config.USE_RPN_ROIS:
# Ignore predicted ROIs and use ROIs provided as an input.
input_rois = KL.Input(shape=[config.POST_NMS_ROIS_TRAINING, 4],name="input_roi", dtype=np.int32)
# Normalize coordinates
target_rois = KL.Lambda(lambda x: norm_boxes_graph(x, K.shape(input_image)[1:3]))(input_rois)
else:
target_rois = rpn_rois # 用rpn层
# Generate detection targets
# Subsamples proposals and generates target outputs for training
# Note that proposal class IDs, gt_boxes, and gt_masks are zero
# padded. Equally, returned rois and targets are zero padded.
rois, target_class_ids, target_bbox, target_mask = DetectionTargetLayer(config, name="proposal_targets")([target_rois, input_gt_class_ids, gt_boxes, input_gt_masks])
# rois=[batch_size,none,4], target_class_ids=[batch_size,none], target_bbox=deltas=[batch_size,none,4], target_mask=[batch_size,height,width]
# rois: [batch, TRAIN_ROIS_PER_IMAGE, (y1, x1, y2, x2)] in normalized coordinates
# target_class_ids: [batch, TRAIN_ROIS_PER_IMAGE].Integer class IDs.
# target_deltas: [batch, TRAIN_ROIS_PER_IMAGE, (dy, dx, log(dh), log(dw)]
# target_mask:[batch, TRAIN_ROIS_PER_IMAGE, height, width]
# Masks cropped to bbox boundaries and resized to neural network output size.
# Network Heads
# TODO: verify that this handles zero padded ROIs
mrcnn_class_logits, mrcnn_class, mrcnn_bbox =fpn_classifier_graph(rois, mrcnn_feature_maps, input_image_meta,
config.POOL_SIZE, config.NUM_CLASSES, # config.POOL_SIZE = 7
train_bn=config.TRAIN_BN,
fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE) # FPN_CLASSIF_FC_LAYERS_SIZE = 1024
mrcnn_mask = build_fpn_mask_graph(rois, mrcnn_feature_maps,
input_image_meta,
config.MASK_POOL_SIZE,
config.NUM_CLASSES,
train_bn=config.TRAIN_BN)
# TODO: clean up (use tf.identify if necessary)
output_rois = KL.Lambda(lambda x: x * 1, name="output_rois")(rois)
# Losses
rpn_class_loss = KL.Lambda(lambda x: rpn_class_loss_graph(*x), name="rpn_class_loss")([input_rpn_match, rpn_class_logits])
rpn_bbox_loss = KL.Lambda(lambda x: rpn_bbox_loss_graph(config, *x), name="rpn_bbox_loss")([input_rpn_bbox, input_rpn_match, rpn_bbox])
class_loss = KL.Lambda(lambda x: mrcnn_class_loss_graph(*x), name="mrcnn_class_loss")([target_class_ids, mrcnn_class_logits, active_class_ids])
bbox_loss = KL.Lambda(lambda x: mrcnn_bbox_loss_graph(*x), name="mrcnn_bbox_loss")([target_bbox, target_class_ids, mrcnn_bbox])
mask_loss = KL.Lambda(lambda x: mrcnn_mask_loss_graph(*x), name="mrcnn_mask_loss")([target_mask, target_class_ids, mrcnn_mask])
# Model
inputs = [input_image, input_image_meta,
input_rpn_match, input_rpn_bbox, input_gt_class_ids, input_gt_boxes, input_gt_masks]
if not config.USE_RPN_ROIS:
inputs.append(input_rois)
outputs = [rpn_class_logits, rpn_class, rpn_bbox,
mrcnn_class_logits, mrcnn_class, mrcnn_bbox, mrcnn_mask,
rpn_rois, output_rois,
rpn_class_loss, rpn_bbox_loss, class_loss, bbox_loss, mask_loss]
model = KM.Model(inputs, outputs, name=‘mask_rcnn‘)
else:
# Network Heads
# Proposal classifier and BBox regressor heads
mrcnn_class_logits, mrcnn_class, mrcnn_bbox =
fpn_classifier_graph(rpn_rois, mrcnn_feature_maps, input_image_meta,
config.POOL_SIZE, config.NUM_CLASSES,
train_bn=config.TRAIN_BN,
fc_layers_size=config.FPN_CLASSIF_FC_LAYERS_SIZE)
# Detections
# output is [batch, num_detections, (y1, x1, y2, x2, class_id, score)] in
# normalized coordinates
detections = DetectionLayer(config, name="mrcnn_detection")([rpn_rois, mrcnn_class, mrcnn_bbox, input_image_meta])
# Create masks for detections
detection_boxes = KL.Lambda(lambda x: x[..., :4])(detections)
mrcnn_mask = build_fpn_mask_graph(detection_boxes, mrcnn_feature_maps, input_image_meta, config.MASK_POOL_SIZE, config.NUM_CLASSES, train_bn=config.TRAIN_BN)
model = KM.Model([input_image, input_image_meta, input_anchors],[detections, mrcnn_class, mrcnn_bbox, mrcnn_mask, rpn_rois, rpn_class, rpn_bbox],name=‘mask_rcnn‘)
return model
def load_weights(self, filepath, by_name=False, exclude=None):
"""Modified version of the corresponding Keras function with
the addition of multi-GPU support and the ability to exclude
some layers from loading.
exclude: list of layer names to exclude
"""
import h5py
# Conditional import to support versions of Keras before 2.2
# TODO: remove in about 6 months (end of 2018)
try:
from keras.engine import saving
except ImportError:
# Keras before 2.2 used the ‘topology‘ namespace.
from keras.engine import topology as saving
if exclude:
by_name = True
if h5py is None:
raise ImportError(‘`load_weights` requires h5py.‘)
f = h5py.File(filepath, mode=‘r‘)
if ‘layer_names‘ not in f.attrs and ‘model_weights‘ in f:
f = f[‘model_weights‘]
# In multi-GPU training, we wrap the model. Get layers
# of the inner model because they have the weights.
keras_model = self.keras_model
layers = keras_model.inner_model.layers if hasattr(keras_model, "inner_model")
else keras_model.layers
# Exclude some layers
if exclude:
layers = filter(lambda l: l.name not in exclude, layers)
if by_name:
saving.load_weights_from_hdf5_group_by_name(f, layers)
else:
saving.load_weights_from_hdf5_group(f, layers)
if hasattr(f, ‘close‘):
f.close()
def compile(self, learning_rate, momentum):
"""Gets the model ready for training. Adds losses, regularization, and
metrics. Then calls the Keras compile() function.
"""
# Optimizer object
optimizer = keras.optimizers.SGD(
lr=learning_rate, momentum=momentum,
clipnorm=self.config.GRADIENT_CLIP_NORM)
# Add Losses
# First, clear previously set losses to avoid duplication
self.keras_model._losses = []
self.keras_model._per_input_losses = {}
loss_names = [
"rpn_class_loss", "rpn_bbox_loss",
"mrcnn_class_loss", "mrcnn_bbox_loss", "mrcnn_mask_loss"]
for name in loss_names:
layer = self.keras_model.get_layer(name)
if layer.output in self.keras_model.losses:
continue
loss = (
tf.reduce_mean(layer.output, keepdims=True)
* self.config.LOSS_WEIGHTS.get(name, 1.))
self.keras_model.add_loss(loss)
# Add L2 Regularization
# Skip gamma and beta weights of batch normalization layers.
reg_losses = [
keras.regularizers.l2(self.config.WEIGHT_DECAY)(w) / tf.cast(tf.size(w), tf.float32)
for w in self.keras_model.trainable_weights
if ‘gamma‘ not in w.name and ‘beta‘ not in w.name]
self.keras_model.add_loss(tf.add_n(reg_losses))
# Compile
self.keras_model.compile(optimizer=optimizer)
# Add metrics for losses
for name in loss_names:
if name in self.keras_model.metrics_names:
continue
layer = self.keras_model.get_layer(name)
self.keras_model.metrics_names.append(name)
loss = (
tf.reduce_mean(layer.output, keepdims=True)
* self.config.LOSS_WEIGHTS.get(name, 1.))
self.keras_model.metrics_tensors.append(loss)
def set_trainable(self, layer_regex, keras_model=None, indent=0):
"""Sets model layers as trainable if their names match
the given regular expression.
hasattr() 函数用于判断对象是否包含对应的属性
"""
keras_model = keras_model or self.keras_model
layers =keras_model.layers
for layer in layers:
# Is the layer a model?
if layer.__class__.__name__ == ‘Model‘:
print("In model: ", layer.name)
self.set_trainable(
layer_regex, keras_model=layer, indent=indent + 4)
continue
if not layer.weights:
continue
# Is it trainable?
trainable = bool(re.fullmatch(layer_regex, layer.name))
# 上面是一个Bool型,表示冻结还是不冻结
# Update layer. If layer is a container, update inner layer.
if layer.__class__.__name__ == ‘TimeDistributed‘:
layer.layer.trainable = trainable
else:
layer.trainable = trainable
def train(self, train_dataset, learning_rate, epochs, layers, custom_callbacks=None):
"""Train the model.
train_dataset, val_dataset: Training and validation Dataset objects.
learning_rate: The learning rate to train with
epochs: Number of training epochs. Note that previous training epochs
are considered to be done alreay, so this actually determines
the epochs to train in total rather than in this particaular
call.
layers: Allows selecting wich layers to train. It can be:
- A regular expression to match layer names to train
- One of these predefined values:
heads: The RPN, classifier and mask heads of the network
all: All the layers
3+: Train Resnet stage 3 and up
4+: Train Resnet stage 4 and up
5+: Train Resnet stage 5 and up
augmentation: Optional. An imgaug (https://github.com/aleju/imgaug)
augmentation. For example, passing imgaug.augmenters.Fliplr(0.5)
flips images right/left 50% of the time. You can pass complex
augmentations as well. This augmentation applies 50% of the
time, and when it does it flips images right/left half the time
and adds a Gaussian blur with a random sigma in range 0 to 5.
augmentation = imgaug.augmenters.Sometimes(0.5, [
imgaug.augmenters.Fliplr(0.5),
imgaug.augmenters.GaussianBlur(sigma=(0.0, 5.0))
])
custom_callbacks: Optional. Add custom callbacks to be called
with the keras fit_generator method. Must be list of type keras.callbacks.
no_augmentation_sources: Optional. List of sources to exclude for
augmentation. A source is string that identifies a dataset and is
defined in the Dataset class.
"""
assert self.mode == "training", "Create model in training mode."
# Pre-defined layer regular expressions
layer_regex = {
# all layers but the backbone
"heads": r"(mrcnn1\\_.*)|(rpn\\_.*)|(fpn\\_.*)",
# From a specific Resnet stage and up
"3+": r"(res3.*)|(bn3.*)|(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn1\\_.*)|(rpn\\_.*)|(fpn\\_.*)",
"4+": r"(res4.*)|(bn4.*)|(res5.*)|(bn5.*)|(mrcnn1\\_.*)|(rpn\\_.*)|(fpn\\_.*)",
"5+": r"(res5.*)|(bn5.*)|(mrcnn1\\_.*)|(rpn\\_.*)|(fpn\\_.*)",
# All layers
"all": ".*",
}
if layers in layer_regex.keys():
layers = layer_regex[layers]
# Callbacks
callbacks = [
# keras.callbacks.TensorBoard(log_dir=self.log_dir, histogram_freq=0, write_graph=True, write_images=False),
keras.callbacks.ModelCheckpoint("C:Users51102Desktopmaskrcnn(tangjun)log{epoch:02d}.h5", verbose=0, save_weights_only=True),
]
self.set_trainable(layers)
self.compile(learning_rate, self.config.LEARNING_MOMENTUM)
self.keras_model.fit_generator(
train_dataset,
initial_epoch=0,#self.epoch,
epochs=epochs,
steps_per_epoch=self.config.STEPS_PER_EPOCH,
callbacks=callbacks,
# validation_data=val_generator,
# validation_steps=self.config.VALIDATION_STEPS,
# max_queue_size=100,
# workers=workers,
# use_multiprocessing=True,
)
def mold_inputs(self, images):
"""Takes a list of images and modifies them to the format expected
as an input to the neural network.
images: List of image matrices [height,width,depth]. Images can have
different sizes.
Returns 3 Numpy matrices:
molded_images: [N, h, w, 3]. Images resized and normalized.
image_metas: [N, length of meta data]. Details about each image.
windows: [N, (y1, x1, y2, x2)]. The portion of the image that has the
original image (padding excluded).
"""
molded_images = []
image_metas = []
windows = []
for image in images:
# Resize image
# TODO: move resizing to mold_image()
molded_image, window, scale, padding = resize_image(
image,
min_dim=self.config.IMAGE_MIN_DIM,
min_scale=self.config.IMAGE_MIN_SCALE,
max_dim=self.config.IMAGE_MAX_DIM,
mode=self.config.IMAGE_RESIZE_MODE)
molded_image = molded_image.astype(np.float32) - self.config.MEAN_PIXEL # 减平均像素
# Build image_meta 形式为np数组
# Build image_meta
image_meta = np.array(
[0] + # size=1
list(image.shape) + # size=3
list(molded_image.shape) + # size=3
list(window) + # size=4 (y1, x1, y2, x2) in image cooredinates
[scale] + # size=1
list(np.zeros([self.config.NUM_CLASSES], dtype=np.int32)) # size=num_classes
)
# Append
molded_images.append(molded_image)
windows.append(window)
image_metas.append(image_meta)
# Pack into arrays
molded_images = np.stack(molded_images)
image_metas = np.stack(image_metas)
windows = np.stack(windows)
return molded_images, image_metas, windows
def unmold_detections(self, detections, mrcnn_mask, original_image_shape, image_shape, window):
"""Reformats the detections of one image from the format of the neural
network output to a format suitable for use in the rest of the
application.
detections: [N, (y1, x1, y2, x2, class_id, score)] in normalized coordinates
mrcnn_mask: [N, height, width, num_classes]
original_image_shape: [H, W, C] Original image shape before resizing
image_shape: [H, W, C] Shape of the image after resizing and padding
window: [y1, x1, y2, x2] Pixel coordinates of box in the image where the real
image is excluding the padding.
Returns:
boxes: [N, (y1, x1, y2, x2)] Bounding boxes in pixels
class_ids: [N] Integer class IDs for each bounding box
scores: [N] Float probability scores of the class_id
masks: [height, width, num_instances] Instance masks
"""
# How many detections do we have?
# Detections array is padded with zeros. Find the first class_id == 0.
zero_ix = np.where(detections[:, 4] == 0)[0] # 去除为0的
N = zero_ix[0] if zero_ix.shape[0] > 0 else detections.shape[0] # 有意义的检测结果数N
# N 是 detections中box为非0的数量,因为detections[:,4]是先有值,表示有检测box,没有检测到的
# box值为0,因此zeros_ix[0]刚好表示有值的个数
# Extract boxes, class_ids, scores, and class-specific masks
boxes = detections[:N, :4] # [N, (y1, x1, y2, x2)] 提取有值的box
class_ids = detections[:N, 4].astype(np.int32) # [N, class_id] 提取对应的类
scores = detections[:N, 5] # [N, score] 提取对应的置信度
masks = mrcnn_mask[np.arange(N), :, :, class_ids] # [N, height, width, num_classes] 提取对应的msask
# Translate normalized coordinates in the resized image to pixel
# coordinates in the original image before resizing
# 下面将resize后的图像在对应的window中规范化
h, w = image_shape[:2] # image_shape 为resize后图像图像的画布,该画布包含输入图片的resize,也是预测输入尺寸
scale_norm = np.array([h - 1, w - 1, h - 1, w - 1])
shift_norm = np.array([0, 0, 1, 1])
# window 为输入图像所占画布尺寸
window = np.divide((window - shift_norm), scale_norm).astype(np.float32)
# window = norm_boxes(window, image_shape[:2]) # window相对输入图片规范化
wy1, wx1, wy2, wx2 = window
shift = np.array([wy1, wx1, wy1, wx1])
wh = wy2 - wy1 # window height
ww = wx2 - wx1 # window width
scale = np.array([wh, ww, wh, ww])
# Convert boxes to normalized coordinates on the window
boxes = np.divide(boxes - shift, scale) # box相对window坐标规范化,经过上面的处理
# Convert boxes to pixel coordinates on the original image
# boxes = denorm_boxes(boxes, original_image_shape[:2]) # box相对原图解规范化
h, w = original_image_shape[:2]
scale = np.array([h - 1, w - 1, h - 1, w - 1])
shift = np.array([0, 0, 1, 1])
boxes = np.around(np.multiply(boxes, scale) + shift).astype(np.int32)
# Filter out detections with zero area. Happens in early training when
# network weights are still random
exclude_ix = np.where((boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) <= 0)[0]
# 上一行代码是h*w判断是否box不对,然后排除,给出是h*w小于0的位置编号
if exclude_ix.shape[0] > 0: # 如果有就删除
boxes = np.delete(boxes, exclude_ix, axis=0)
class_ids = np.delete(class_ids, exclude_ix, axis=0)
scores = np.delete(scores, exclude_ix, axis=0)
masks = np.delete(masks, exclude_ix, axis=0)
N = class_ids.shape[0]
# Resize masks to original image size and set boundary threshold.
full_masks = []
for i in range(N): # 单个box操作
# Convert neural network mask to full size mask
# 以下代码是将预测的mask进行resize并填充到真实图像对应的区域
"""Converts a mask generated by the neural network to a format similar
to its original shape.
mask: [height, width] of type float. A small, typically 28x28 mask.
bbox: [y1, x1, y2, x2]. The box to fit the mask in.
Returns a binary mask with the same size as the original image.
"""
threshold = 0.5 # 这是决定是否有mask展现的阈值
y1, x1, y2, x2 = boxes[i]
mask_temp = resize(masks[i], (y2 - y1, x2 - x1)) # 将预测的mask resize 到box中
mask_temp = np.where(mask_temp >= threshold, 1, 0).astype(np.bool)
# Put the mask in the right location.
full_mask = np.zeros(original_image_shape[:2], dtype=np.bool)
full_mask[y1:y2, x1:x2] = mask_temp # full_mask 为二维的
full_masks.append(full_mask) # 将其添加到列表中
# np.stack 表示增加一个维度,这里为二维,增加一维是-1的维度为2
full_masks = np.stack(full_masks, axis=-1)
if full_masks else np.empty(original_image_shape[:2] + (0,))
# [n, (y1, x1, y2, x2)]
# [n, class_id]
# [n, class_id]
# [h, w, n]
return boxes, class_ids, scores, full_masks
def detect(self, images, log_print=0):
"""Runs the detection pipeline.
images: List of images, potentially of different sizes.
Returns a list of dicts, one dict per image. The dict contains:
rois: [N, (y1, x1, y2, x2)] detection bounding boxes
class_ids: [N] int class IDs
scores: [N] float probability scores for the class IDs
masks: [H, W, N] instance binary masks
"""
assert self.mode == "inference", "Create model in inference mode."
assert len(images) == self.config.batch_size, "len(images) must be equal to BATCH_SIZE"
# Mold inputs to format expected by the neural network
molded_images, image_metas, windows = self.mold_inputs(images)
# Validate image sizes
# All images in a batch MUST be of the same size
image_shape = molded_images[0].shape
for g in molded_images[1:]:
assert g.shape == image_shape,
"After resizing, all images must have the same size. Check IMAGE_RESIZE_MODE and image sizes."
# Anchors
anchors = self.get_anchors(image_shape)
# Duplicate across the batch dimension because Keras requires it
# TODO: can this be optimized to avoid duplicating the anchors?
anchors = np.broadcast_to(anchors, (self.config.batch_size,) + anchors.shape)
# 日志记录
if log_print:
log("molded_images", molded_images)
log("image_metas", image_metas)
log("anchors", anchors)
detections, _, _, mrcnn_mask, _, _, _ =
self.keras_model.predict([molded_images, image_metas, anchors], verbose=0)
# 根据模型得到预测时候想要的结果
# Process detections
results = [] # 建立空列表,保存最终结果
for i, image in enumerate(images):
# 需要单张处理,因为原始图片images不保证每张尺寸一致
final_rois, final_class_ids, final_scores, final_masks =
self.unmold_detections(detections[i], mrcnn_mask[i],
image.shape, molded_images[i].shape, windows[i])
# windows 是保存resize图像尺寸中真实图像分布在该尺寸中的坐标位置,左上角与右下角的坐标点
results.append({
"rois": final_rois,
"class_ids": final_class_ids,
"scores": final_scores,
"masks": final_masks})
print(‘it is ok ‘)
return results
def generate_pyramid_anchors(self,scales, ratios, feature_shapes, feature_strides, anchor_stride):
"""Generate anchors at different levels of a feature pyramid. Each scale
is associated with a level of the pyramid, but each ratio is used in
all levels of the pyramid.
Returns:
anchors: [N, (y1, x1, y2, x2)]. All generated anchors in one array. Sorted
with the same order of the given scales. So, anchors of scale[0] come
first, then anchors of scale[1], and so on.
"""
# Anchors
# [anchor_count, (y1, x1, y2, x2)]
anchors = []
for i in range(len(scales)):
# anchors.append(generate_anchors(scales[i], ratios, feature_shapes[i], feature_strides[i], anchor_stride))
"""
scales: 1D array of anchor sizes in pixels. Example: [32, 64, 128]
ratios: 1D array of anchor ratios of width/height. Example: [0.5, 1, 2]
shape: [height, width] spatial shape of the feature map over which to generate anchors.
feature_stride: Stride of the feature map relative to the image in pixels.
anchor_stride: Stride of anchors on the feature map. For example, if the value is 2 then generate anchors for every other feature map pixel.
"""
# Get all combinations of scales and ratios
scale, ratios = np.meshgrid(np.array(scales[i]), np.array(ratios))
scale = scale.flatten()
ratios = ratios.flatten()
shape = feature_shapes[i]
feature_stride = feature_strides[i]
# Enumerate heights and widths from scales and ratios
# 实际得到box的宽与高
heights = scale / np.sqrt(ratios)
widths = scale * np.sqrt(ratios)
# Enumerate shifts in feature space
# 实际得到box坐标中心
shifts_y = np.arange(0, shape[0],
anchor_stride) * feature_stride # anchor_stride 表示原图img/stride缩放后以anchor_stride为步长取像素,
# 一此作为中心点,而后乘以feature_stride(stride)将像素中心放回原图像位置中。
shifts_x = np.arange(0, shape[1], anchor_stride) * feature_stride
shifts_x, shifts_y = np.meshgrid(shifts_x, shifts_y)
# Enumerate combinations of shifts, widths, and heights
box_widths, box_centers_x = np.meshgrid(widths, shifts_x)
box_heights, box_centers_y = np.meshgrid(heights, shifts_y)
# Reshape to get a list of (y, x) and a list of (h, w)
box_centers = np.stack([box_centers_y, box_centers_x], axis=2).reshape([-1, 2])
box_sizes = np.stack([box_heights, box_widths], axis=2).reshape([-1, 2])
# code above make center of bboxes and height width of bboxes
# Convert to corner coordinates (y1, x1, y2, x2)
boxes = np.concatenate([box_centers - 0.5 * box_sizes, box_centers + 0.5 * box_sizes], axis=1)
# convert center height and width coordinate of bbox to four coordinates which respectively represnt top left corner and lower right corner
anchors.append(boxes)
return np.concatenate(anchors, axis=0)
def get_anchors(self, image_shape):
"""Returns anchor pyramid for the given image size."""
# [N, (height, width)]
backbone_shapes = compute_backbone_shapes(self.config, image_shape)
# Cache anchors and reuse if image shape is the same
if not hasattr(self, "_anchor_cache"):
self._anchor_cache = {}
if not tuple(image_shape) in self._anchor_cache:
# Generate Anchors: [anchor_count, (y1, x1, y2, x2)]
# Generate Anchors
a = self.generate_pyramid_anchors(
self.config.RPN_ANCHOR_SCALES,# (32, 64, 128, 256, 512)->16, 32,64, 128, 256
self.config.RPN_ANCHOR_RATIOS, # [0.5, 1, 2]
backbone_shapes, # with shape [N, (height, width)]
self.config.BACKBONE_STRIDES,# [4, 8, 16, 32, 64]
self.config.RPN_ANCHOR_STRIDE)# 1
# Keep a copy of the latest anchors in pixel coordinates because
# it‘s used in inspect_model notebooks.
# TODO: Remove this after the notebook are refactored to not use it
self.anchors = a #[n,4]
# Normalize coordinates
# self._anchor_cache[tuple(image_shape)] = utils.norm_boxes(a, image_shape[:2])
# 进行归一化
h, w = image_shape[:2]
scale = np.array([h - 1, w - 1, h - 1, w - 1])
shift = np.array([0, 0, 1, 1])
self._anchor_cache[tuple(image_shape)]= np.divide((a - shift), scale).astype(np.float32)
return self._anchor_cache[tuple(image_shape)]
def find_trainable_layer(self, layer):
"""If a layer is encapsulated by another layer, this function
digs through the encapsulation and returns the layer that holds
the weights.
"""
if layer.__class__.__name__ == ‘TimeDistributed‘:
return self.find_trainable_layer(layer.layer)
return layer
def get_trainable_layers(self):
"""Returns a list of layers that have weights."""
layers = []
# Loop through all layers
for l in self.keras_model.layers:
# If layer is a wrapper, find inner trainable layer
l = self.find_trainable_layer(l)
# Include layer if it has weights
if l.get_weights():
layers.append(l)
return layers
def parse_image_meta_graph(meta):
image_id = meta[:, 0]
original_image_shape = meta[:, 1:4]
image_shape = meta[:, 4:7]
window = meta[:, 7:11] # (y1, x1, y2, x2) window of image in in pixels
scale = meta[:, 11]
active_class_ids = meta[:, 12:]
return {
"image_id": image_id,
"original_image_shape": original_image_shape,
"image_shape": image_shape,
"window": window,
"scale": scale,
"active_class_ids": active_class_ids,
}
############################################################
# Miscellenous Graph Functions
############################################################
def trim_zeros_graph(boxes, name=‘trim_zeros‘):
"""Often boxes are represented with matrices of shape [N, 4] and
are padded with zeros. This removes zero boxes.
boxes: [N, 4] matrix of boxes.
non_zeros: [N] a 1D boolean mask identifying the rows to keep
"""
non_zeros = tf.cast(tf.reduce_sum(tf.abs(boxes), axis=1), tf.bool)
boxes = tf.boolean_mask(boxes, non_zeros, name=name) # tf.boolean_mask 为True的保存下来
return boxes, non_zeros
# def batch_pack_graph(x, counts, num_rows):
# """Picks different number of values from each row in x depending on the values in counts.
# """
# outputs = []
# for i in range(num_rows):
# outputs.append(x[i, :counts[i]])
# return tf.concat(outputs, axis=0)
def norm_boxes_graph(boxes, shape):
"""Converts boxes from pixel coordinates to normalized coordinates.
boxes: [..., (y1, x1, y2, x2)] in pixel coordinates
shape: [..., (height, width)] in pixels
Note: In pixel coordinates (y2, x2) is outside the box. But in normalized
coordinates it‘s inside the box.
Returns:
[..., (y1, x1, y2, x2)] in normalized coordinates
"""
h, w = tf.split(tf.cast(shape, tf.float32), 2) # h 在第一维度分成2个
scale = tf.concat([h, w, h, w], axis=-1) - tf.constant(1.0)
shift = tf.constant([0., 0., 1., 1.])
return tf.divide(boxes - shift, scale)
# def denorm_boxes_graph(boxes, shape):
# """Converts boxes from normalized coordinates to pixel coordinates.
# boxes: [..., (y1, x1, y2, x2)] in normalized coordinates
# shape: [..., (height, width)] in pixels
#
# Note: In pixel coordinates (y2, x2) is outside the box. But in normalized
# coordinates it‘s inside the box.
#
# Returns:
# [..., (y1, x1, y2, x2)] in pixel coordinates
# """
# h, w = tf.split(tf.cast(shape, tf.float32), 2)
# scale = tf.concat([h, w, h, w], axis=-1) - tf.constant(1.0)
# shift = tf.constant([0., 0., 1., 1.])
# return tf.cast(tf.round(tf.multiply(boxes, scale) + shift), tf.int32)
此代码为推理文件.py
"""
MASKRCNN algrithm for object detection and instance segmentation
Written and modified by tang jun on JAN , 2019
if you have questions , please connect me by Email: tangjunjunfighter@163.com
"""
import scipy
import os
import random
import datetime
import re
import math
import logging
from collections import OrderedDict
import multiprocessing
import numpy as np
import tensorflow as tf
import keras
import keras.backend as K # keras中的后端backend及其相关函数
import keras.layers as KL
import keras.engine as KE
import keras.models as KM
import math
import os
import sys
import numpy as np
import cv2
import matplotlib.pyplot as plt
import yaml
from PIL import Image
import random
# from mrcnn1 import utils, model as modellib, visualize
# from mrcnn1 import utils, model as modellib, visualize
import model as modellib
import visualize
from distutils.version import LooseVersion
assert LooseVersion(tf.__version__) >= LooseVersion("1.3")
assert LooseVersion(keras.__version__) >= LooseVersion(‘2.0.8‘)
ROOT_DIR = os.getcwd() # 得到当前路径
sys.path.append(ROOT_DIR) # To find local version of the library
# Directory to save logs and trained models
MODEL_DIR = os.path.join(ROOT_DIR, "logs") # 在当前路径的logs文件路径
iter_num = 0
# Local path to trained weights file
COCO_MODEL_PATH = os.path.join(ROOT_DIR, "mask_rcnn_coco.h5") # 载入训练模型权重路径
class Config_config(object):
"""Base configuration class. For custom configurations, create a
sub-class that inherits from this one and override properties
that need to be changed.
"""
IMAGE_RESIZE_MODE = "square"
IMAGE_MIN_DIM = 128
IMAGE_MAX_DIM = 256
NUM_CLASSES = 1 + 4 # Override in sub-classes
PRE_NMS_LIMIT = 6000
IMAGE_CHANNEL_COUNT = 3
# Name the configurations. For example, ‘COCO‘, ‘Experiment 3‘, ...etc.
# Useful if your code needs to do things differently depending on which
# experiment is running.
NAME = "shapes" # Override in sub-classes
GPU_COUNT = 1
IMAGES_PER_GPU = 1
# Number of training steps per epoch
# This doesn‘t need to match the size of the training set. Tensorboard
# updates are saved at the end of each epoch, so setting this to a
# smaller number means getting more frequent TensorBoard updates.
# Validation stats are also calculated at each epoch end and they
# might take a while, so don‘t set this too small to avoid spending
# a lot of time on validation stats.
STEPS_PER_EPOCH = 3
# Number of validation steps to run at the end of every training epoch.
# A bigger number improves accuracy of validation stats, but slows
# down the training.
VALIDATION_STEPS = 50
# Backbone network architecture
# Supported values are: resnet50, resnet101.
# You can also provide a callable that should have the signature
# of model.resnet_graph. If you do so, you need to supply a callable
# to COMPUTE_BACKBONE_SHAPE as well
BACKBONE = "resnet101"
# Only useful if you supply a callable to BACKBONE. Should compute
# the shape of each layer of the FPN Pyramid.
# See model.compute_backbone_shapes
# COMPUTE_BACKBONE_SHAPE = None
# The strides of each layer of the FPN Pyramid. These values
# are based on a Resnet101 backbone.
BACKBONE_STRIDES = [4, 8, 16, 32, 64]
# Size of the fully-connected layers in the classification graph
FPN_CLASSIF_FC_LAYERS_SIZE = 1024
# Size of the top-down layers used to build the feature pyramid
TOP_DOWN_PYRAMID_SIZE = 256
# Number of classification classes (including background)
# Length of square anchor side in pixels
RPN_ANCHOR_SCALES = (8, 16, 32, 64, 128)
# Ratios of anchors at each cell (width/height)
# A value of 1 represents a square anchor, and 0.5 is a wide anchor
RPN_ANCHOR_RATIOS = [0.5, 1, 2]
# Anchor stride
# If 1 then anchors are created for each cell in the backbone feature map.
# If 2, then anchors are created for every other cell, and so on.
RPN_ANCHOR_STRIDE = 1
# Non-max suppression threshold to filter RPN proposals.
# You can increase this during training to generate more propsals.
RPN_NMS_THRESHOLD = 0.7
# How many anchors per image to use for RPN training
RPN_TRAIN_ANCHORS_PER_IMAGE = 256 # rpn数据需要此值,rpn网络也需要次之
# ROIs kept after non-maximum supression (training and inference)
POST_NMS_ROIS_TRAINING = 2000
POST_NMS_ROIS_INFERENCE = 1000
# If enabled, resizes instance masks to a smaller size to reduce
# memory load. Recommended when using high-resolution images.
USE_MINI_MASK = False
MINI_MASK_SHAPE = (56, 56) # (height, width) of the mini-mask
# Input image resizing
# Generally, use the "square" resizing mode for training and inferencing
# and it should work well in most cases. In this mode, images are scaled
# up such that the small side is = IMAGE_MIN_DIM, but ensuring that the
# scaling doesn‘t make the long side > IMAGE_MAX_DIM. Then the image is
# padded with zeros to make it a square so multiple images can be put
# in one batch.
# Available resizing modes:
# none: No resizing or padding. Return the image unchanged.
# square: Resize and pad with zeros to get a square image
# of size [max_dim, max_dim].
# pad64: Pads width and height with zeros to make them multiples of 64.
# If IMAGE_MIN_DIM or IMAGE_MIN_SCALE are not None, then it scales
# up before padding. IMAGE_MAX_DIM is ignored in this mode.
# The multiple of 64 is needed to ensure smooth scaling of feature
# maps up and down the 6 levels of the FPN pyramid (2**6=64).
# crop: Picks random crops from the image. First, scales the image based
# on IMAGE_MIN_DIM and IMAGE_MIN_SCALE, then picks a random crop of
# size IMAGE_MIN_DIM x IMAGE_MIN_DIM. Can be used in training only.
# IMAGE_MAX_DIM is not used in this mode.
IMAGE_RESIZE_MODE = "square"
# Minimum scaling ratio. Checked after MIN_IMAGE_DIM and can force further
# up scaling. For example, if set to 2 then images are scaled up to double
# the width and height, or more, even if MIN_IMAGE_DIM doesn‘t require it.
# Howver, in ‘square‘ mode, it can be overruled by IMAGE_MAX_DIM.
IMAGE_MIN_SCALE = 0
# Image mean (RGB)
MEAN_PIXEL = np.array([123.7, 116.8, 103.9])
# Number of ROIs per image to feed to classifier/mask heads
# The Mask RCNN paper uses 512 but often the RPN doesn‘t generate
# enough positive proposals to fill this and keep a positive:negative
# ratio of 1:3. You can increase the number of proposals by adjusting
# the RPN NMS threshold.
TRAIN_ROIS_PER_IMAGE = 100
# Percent of positive ROIs used to train classifier/mask heads
ROI_POSITIVE_RATIO = 0.33
# Pooled ROIs
POOL_SIZE = 7
MASK_POOL_SIZE = 14
# Shape of output mask
# To change this you also need to change the neural network mask branch
MASK_SHAPE = [28, 28]
# Maximum number of ground truth instances to use in one image
MAX_GT_INSTANCES = 100
# Bounding box refinement standard deviation for RPN and final detections.
RPN_BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
BBOX_STD_DEV = np.array([0.1, 0.1, 0.2, 0.2])
# Max number of final detections
DETECTION_MAX_INSTANCES = 100
# Minimum probability value to accept a detected instance
# ROIs below this threshold are skipped
DETECTION_MIN_CONFIDENCE = 0.9 # 大于就选择
# Non-maximum suppression threshold for detection
DETECTION_NMS_THRESHOLD = 0.15 # 小于就选择
# Learning rate and momentum
# The Mask RCNN paper uses lr=0.02, but on TensorFlow it causes
# weights to explode. Likely due to differences in optimzer
# implementation.
LEARNING_RATE = 0.001
LEARNING_MOMENTUM = 0.9
# Weight decay regularization
WEIGHT_DECAY = 0.0001
# Loss weights for more precise optimization.
# Can be used for R-CNN training setup.
LOSS_WEIGHTS = {
"rpn_class_loss": 1.,
"rpn_bbox_loss": 1.,
"mrcnn_class_loss": 1.,
"mrcnn_bbox_loss": 1.,
"mrcnn_mask_loss": 1.
}
# Use RPN ROIs or externally generated ROIs for training
# Keep this True for most situations. Set to False if you want to train
# the head branches on ROI generated by code rather than the ROIs from
# the RPN. For example, to debug the classifier head without having to
# train the RPN.
USE_RPN_ROIS = True
# Train or freeze batch normalization layers
# None: Train BN layers. This is the normal mode
# False: Freeze BN layers. Good when using a small batch size
# True: (don‘t use). Set layer in training mode even when inferencing
TRAIN_BN = True # Defaulting to False since batch size is often small
# Gradient norm clipping
GRADIENT_CLIP_NORM = 5.0
batch_size=1
def __init__(self):
"""Set values of computed attributes."""
# Effective batch size
# self.BATCH_SIZE = self.IMAGES_PER_GPU * self.GPU_COUNT
# Input image size
if self.IMAGE_RESIZE_MODE == "crop":
self.IMAGE_SHAPE = np.array([self.IMAGE_MIN_DIM, self.IMAGE_MIN_DIM, 3])
else:
self.IMAGE_SHAPE = np.array([self.IMAGE_MAX_DIM, self.IMAGE_MAX_DIM, 3])
# Image meta data length
# See compose_image_meta() for details
self.IMAGE_META_SIZE = 1 + 3 + 3 + 4 + 1 + self.NUM_CLASSES
def display(self):
"""Display Configuration values."""
print(" Configurations:")
for a in dir(self):
if not a.startswith("__") and not callable(getattr(self, a)):
print("{:30} {}".format(a, getattr(self, a)))
print(" ")
# 预测图片基本配置更改
class Predict_Config(Config_config):
GPU_COUNT = 1
IMAGES_PER_GPU = 1
IMAGE_MIN_DIM = 128
IMAGE_MAX_DIM = 256
batch_size = 1
def predict():
import skimage.io
config = Predict_Config()
config.display()
model = modellib.MaskRCNN(mode="inference", config=config)
model_path = ‘C:Users51102Desktopmaskrcnn(tangjun)log 4.h5‘
# Load trained weights (fill in path to trained weights here)
assert model_path != "", "Provide path to trained weights"
print("Loading weights from ", model_path)
model.load_weights(model_path, by_name=True)
class_names = [‘BG‘, ‘line_bulge‘,‘dot_concave‘,‘dot_bulge‘,‘Irregular_concave‘]
file_names =‘D:MASKRCNNmask-rcnn-meMASKRCNN_myself .bmp‘
# image = skimage.io.imread(os.path.join(IMAGE_DIR, random.choice(file_names)))
image = skimage.io.imread(file_names)
image=image[:, :, 0:3]
print(‘image=‘, image.shape)
# Run detection
results = model.detect([image], log_print=1)
‘‘‘
results.append({
"rois": final_rois,
"class_ids": final_class_ids,
"scores": final_scores,
"masks": final_masks})
‘‘‘
# Visualize results
r = results[0]
print(‘r=‘,r)
visualize.display_instances(image, r[‘rois‘], r[‘masks‘], r[‘class_ids‘], class_names, r[‘scores‘])
if __name__ == "__main__":
predict()
此代码为显示辅助文件.py
"""
MASKRCNN algrithm for object detection and instance segmentation
Written and modified by tang jun on JAN , 2019
if you have questions , please connect me by Email: tangjunjunfighter@163.com
"""
import cv2 as cv # 自己添加的模块
import os
import sys
import random
import itertools
import colorsys
import numpy as np
from skimage.measure import find_contours
import matplotlib.pyplot as plt
from matplotlib import patches, lines
from matplotlib.patches import Polygon
import IPython.display
# # Root directory of the project
# ROOT_DIR = os.path.abspath("../")
#
# # Import Mask RCNN
# sys.path.append(ROOT_DIR) # To find local version of the library
def random_colors(N, bright=True):
"""
Generate random colors.
To get visually distinct colors, generate them in HSV space then
convert to RGB.
"""
brightness = 1.0 if bright else 0.7
hsv = [(i / N, 1, brightness) for i in range(N)]
colors = list(map(lambda c: colorsys.hsv_to_rgb(*c), hsv))
random.shuffle(colors)
return colors
def apply_mask(image, mask, color, alpha=0.5):
"""Apply the given mask to the image.
"""
for c in range(3):
image[:, :, c] = np.where(mask == 1,
image[:, :, c] *
(1 - alpha) + alpha * color[c] * 255,
image[:, :, c])
return image
def display_instances(image, boxes, masks, class_ids, class_names,
scores=None, title="",
figsize=(16, 16), ax=None,
show_mask=True, show_bbox=True,
colors=None, captions=None):
"""
boxes: [num_instance, (y1, x1, y2, x2, class_id)] in image coordinates.
masks: [height, width, num_instances]
class_ids: [num_instances]
class_names: list of class names of the dataset
scores: (optional) confidence scores for each box
title: (optional) Figure title
show_mask, show_bbox: To show masks and bounding boxes or not
figsize: (optional) the size of the image
colors: (optional) An array or colors to use with each object
captions: (optional) A list of strings to use as captions for each object
"""
# Number of instances
N = boxes.shape[0]
if not N:
print(" *** No instances to display *** ")
else:
assert boxes.shape[0] == masks.shape[-1] == class_ids.shape[0]
# If no axis is passed, create one and automatically call show()
auto_show = False
if not ax:
_, ax = plt.subplots(1, figsize=figsize)
auto_show = True
# Generate random colors
colors = colors or random_colors(N)
# Show area outside image boundaries.
height, width = image.shape[:2]
ax.set_ylim(height + 10, -10)
ax.set_xlim(-10, width + 10)
ax.axis(‘off‘)
ax.set_title(title)
masked_image = image.astype(np.uint32).copy()
for i in range(N):
color = colors[i]
# Bounding box
if not np.any(boxes[i]):
# Skip this instance. Has no bbox. Likely lost in image cropping.
continue
y1, x1, y2, x2 = boxes[i]
# cv.rectangle(masked_image, (y1[0],x1[0]), (y2[0],x2[0]), (0, 250, 0), 2) # 自己添加代码
if show_bbox:
p = patches.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2,
alpha=0.7, linestyle="dashed",
edgecolor=color, facecolor=‘none‘)
ax.add_patch(p)
# Label
if not captions:
class_id = class_ids[i]
score = scores[i] if scores is not None else None
label = class_names[class_id]
caption = "{} {:.3f}".format(label, score) if score else label
else:
caption = captions[i]
ax.text(x1, y1 + 8, caption,
color=‘w‘, size=11, backgroundcolor="none")
# Mask
mask = masks[:, :, i]
if show_mask:
masked_image = apply_mask(masked_image, mask, color)
# Mask Polygon
# Pad to ensure proper polygons for masks that touch image edges.
padded_mask = np.zeros(
(mask.shape[0] + 2, mask.shape[1] + 2), dtype=np.uint8)
padded_mask[1:-1, 1:-1] = mask
contours = find_contours(padded_mask, 0.5)
for verts in contours:
# Subtract the padding and flip (y, x) to (x, y)
verts = np.fliplr(verts) - 1
p = Polygon(verts, facecolor="none", edgecolor=color)
ax.add_patch(p)
ax.imshow(masked_image.astype(np.uint8))
if auto_show:
plt.show()
return masked_image
以上是关于maskrcnn详细注解说明(超详细)的主要内容,如果未能解决你的问题,请参考以下文章