models/pascal_voc/ZF/faster_rcnn_alt_opt/stage1_rpn_train.pt
下面需要注意的是:rpn_cls_score层对每个位置的9个anchor只做bg/fg的二分类,而不关心fg具体属于哪一类别。RPN阶段完成这个任务就够了,后面的Fast R-CNN会对region proposal做细分类和位置精修。
# Network definition used to train the RPN in stage 1.
name: "ZF"
layer {
  name: 'input-data'
  type: 'Python'
  top: 'data'
  top: 'im_info'
  top: 'gt_boxes'
  python_param {
    # Implemented in lib/roi_data_layer/layer.py; supplies the network inputs.
    # While training the RPN the RoIs it provides are the ground-truth boxes.
    module: 'roi_data_layer.layer'
    layer: 'RoIDataLayer'
    param_str: "'num_classes': 21"
  }
}
# The ZF backbone below extracts features and is shared by every stage.
#========= conv1-conv5 ============
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param { num_output: 96 kernel_size: 7 pad: 3 stride: 2 }
}
layer { name: "relu1" type: "ReLU" bottom: "conv1" top: "conv1" }
layer {
  name: "norm1"
  type: "LRN"
  bottom: "conv1"
  top: "norm1"
  lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "norm1"
  top: "pool1"
  pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX }
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param { num_output: 256 kernel_size: 5 pad: 2 stride: 2 }
}
layer { name: "relu2" type: "ReLU" bottom: "conv2" top: "conv2" }
layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param { local_size: 3 alpha: 0.00005 beta: 0.75 norm_region: WITHIN_CHANNEL engine: CAFFE }
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param { kernel_size: 3 stride: 2 pad: 1 pool: MAX }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 }
}
layer { name: "relu3" type: "ReLU" bottom: "conv3" top: "conv3" }
layer {
  name: "conv4"
  type: "Convolution"
  bottom: "conv3"
  top: "conv4"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param { num_output: 384 kernel_size: 3 pad: 1 stride: 1 }
}
layer { name: "relu4" type: "ReLU" bottom: "conv4" top: "conv4" }
layer {
  name: "conv5"
  type: "Convolution"
  bottom: "conv4"
  top: "conv5"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param { num_output: 256 kernel_size: 3 pad: 1 stride: 1 }
}
layer { name: "relu5" type: "ReLU" bottom: "conv5" top: "conv5" }
#========= RPN ============
layer {
  name: "rpn_conv1"
  type: "Convolution"
  bottom: "conv5"
  top: "rpn_conv1"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 256
    kernel_size: 3 pad: 1 stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer { name: "rpn_relu1" type: "ReLU" bottom: "rpn_conv1" top: "rpn_conv1" }
layer {
  name: "rpn_cls_score"
  type: "Convolution"
  bottom: "rpn_conv1"
  top: "rpn_cls_score"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 18   # 2(bg/fg) * 9(anchors)
    kernel_size: 1 pad: 0 stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "rpn_bbox_pred"
  type: "Convolution"
  bottom: "rpn_conv1"
  top: "rpn_bbox_pred"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 36   # 4 * 9(anchors)
    kernel_size: 1 pad: 0 stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  bottom: "rpn_cls_score"
  top: "rpn_cls_score_reshape"
  name: "rpn_cls_score_reshape"
  type: "Reshape"
  reshape_param { shape { dim: 0 dim: 2 dim: -1 dim: 0 } }
}
layer {
  name: 'rpn-data'
  type: 'Python'
  bottom: 'rpn_cls_score'
  bottom: 'gt_boxes'
  bottom: 'im_info'
  bottom: 'data'
  top: 'rpn_labels'
  top: 'rpn_bbox_targets'
  top: 'rpn_bbox_inside_weights'
  top: 'rpn_bbox_outside_weights'
  python_param {
    # Implemented in lib/rpn/anchor_target_layer.py: lays anchors over the
    # input image and matches them with the gt boxes to produce the RPN's
    # classification labels and box-regression targets.
    module: 'rpn.anchor_target_layer'
    layer: 'AnchorTargetLayer'
    param_str: "'feat_stride': 16"
  }
}
layer {
  name: "rpn_loss_cls"
  type: "SoftmaxWithLoss"
  bottom: "rpn_cls_score_reshape"
  bottom: "rpn_labels"
  propagate_down: 1
  propagate_down: 0
  top: "rpn_cls_loss"
  loss_weight: 1
  loss_param { ignore_label: -1 normalize: true }
}
layer {
  name: "rpn_loss_bbox"
  type: "SmoothL1Loss"
  bottom: "rpn_bbox_pred"
  bottom: "rpn_bbox_targets"
  bottom: "rpn_bbox_inside_weights"
  bottom: "rpn_bbox_outside_weights"
  top: "rpn_loss_bbox"
  loss_weight: 1
  smooth_l1_loss_param { sigma: 3.0 }
}
#========= RCNN ============
# Dummy layers so that initial parameters are saved into the output net
layer {
  name: "dummy_roi_pool_conv5"
  type: "DummyData"
  top: "dummy_roi_pool_conv5"
  dummy_data_param {
    shape { dim: 1 dim: 9216 }
    data_filler { type: "gaussian" std: 0.01 }
  }
}
layer {
  name: "fc6"
  type: "InnerProduct"
  bottom: "dummy_roi_pool_conv5"
  top: "fc6"
  param { lr_mult: 0 decay_mult: 0 }
  param { lr_mult: 0 decay_mult: 0 }
  inner_product_param { num_output: 4096 }
}
layer { name: "relu6" type: "ReLU" bottom: "fc6" top: "fc6" }
layer {
  name: "fc7"
  type: "InnerProduct"
  bottom: "fc6"
  top: "fc7"
  param { lr_mult: 0 decay_mult: 0 }
  param { lr_mult: 0 decay_mult: 0 }
  inner_product_param { num_output: 4096 }
}
layer { name: "silence_fc7" type: "Silence" bottom: "fc7" }

lib/roi_data_layer/layer.py
#coding:utf-8
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""The data layer used during training to train a Fast R-CNN network.

RoIDataLayer implements a Caffe Python layer.
"""

import caffe
from fast_rcnn.config import cfg
from roi_data_layer.minibatch import get_minibatch
import numpy as np
import yaml
from multiprocessing import Process, Queue

# Supplies the network with its training inputs (image data plus RoIs).
class RoIDataLayer(caffe.Layer):
    """Fast R-CNN data layer used for training."""

    def _shuffle_roidb_inds(self):
        """Randomly permute the training roidb and reset the read cursor.

        With cfg.TRAIN.ASPECT_GROUPING the permutation is built from pairs of
        indices so that landscape images (width >= height) are listed before
        portrait ones prior to pairing, then the pairs are shuffled.
        """
        if cfg.TRAIN.ASPECT_GROUPING:
            widths = np.array([r['width'] for r in self._roidb])
            heights = np.array([r['height'] for r in self._roidb])
            horz = (widths >= heights)
            vert = np.logical_not(horz)
            horz_inds = np.where(horz)[0]
            vert_inds = np.where(vert)[0]
            inds = np.hstack((
                np.random.permutation(horz_inds),
                np.random.permutation(vert_inds)))
            # NOTE(review): reshaping to (-1, 2) needs an even number of
            # roidb entries -- confirm the caller guarantees that.
            inds = np.reshape(inds, (-1, 2))
            row_perm = np.random.permutation(np.arange(inds.shape[0]))
            inds = np.reshape(inds[row_perm, :], (-1,))
            self._perm = inds
        else:
            self._perm = np.random.permutation(np.arange(len(self._roidb)))
        self._cur = 0

    # Picks the image indices used for the next training batch
    # (cfg.TRAIN.IMS_PER_BATCH of them).
    def _get_next_minibatch_inds(self):
        """Return the roidb indices for the next minibatch."""
        # Reshuffle once the remaining images cannot fill another batch.
        # The roidb holds one entry (its set of RoIs) per image.
        if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
            self._shuffle_roidb_inds()

        # Take cfg.TRAIN.IMS_PER_BATCH images starting at the cursor.
        db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH]
        self._cur += cfg.TRAIN.IMS_PER_BATCH
        return db_inds

    # Builds the blobs used for training.
    def _get_next_minibatch(self):
        """Return the blobs to be used for the next minibatch.

        If cfg.TRAIN.USE_PREFETCH is True, then blobs will be computed in a
        separate process and made available through self._blob_queue.
        """
        if cfg.TRAIN.USE_PREFETCH:
            return self._blob_queue.get()
        else:
            db_inds = self._get_next_minibatch_inds()
            minibatch_db = [self._roidb[i] for i in db_inds]
            # get_minibatch is implemented in lib/roi_data_layer/minibatch.py
            return get_minibatch(minibatch_db, self._num_classes)

    def set_roidb(self, roidb):
        """Set the roidb to be used by this layer during training.

        When cfg.TRAIN.USE_PREFETCH is set this also starts a BlobFetcher
        child process and registers an atexit hook that terminates it.
        """
        self._roidb = roidb
        self._shuffle_roidb_inds()
        if cfg.TRAIN.USE_PREFETCH:
            # Bounded queue: at most 10 prefetched minibatches are buffered.
            self._blob_queue = Queue(10)
            self._prefetch_process = BlobFetcher(self._blob_queue,
                                                 self._roidb,
                                                 self._num_classes)
            self._prefetch_process.start()
            # Terminate the child process when the parent exits
            def cleanup():
                print 'Terminating BlobFetcher'
                self._prefetch_process.terminate()
                self._prefetch_process.join()
            import atexit
            atexit.register(cleanup)

    # Called once when the layer is initialised.
    def setup(self, bottom, top):
        """Setup the RoIDataLayer.

        Parses the layer's param_str, gives every top blob an initial shape
        and records which top index carries which named blob.
        """

        # parse the layer parameter string, which must be valid YAML
        # NOTE(review): yaml.load is called without an explicit Loader;
        # acceptable only because param_str comes from the trusted prototxt.
        layer_params = yaml.load(self.param_str_)

        self._num_classes = layer_params['num_classes']

        self._name_to_top_map = {}

        # data blob: holds a batch of N images, each with 3 channels
        idx = 0
        top[idx].reshape(cfg.TRAIN.IMS_PER_BATCH, 3,
            max(cfg.TRAIN.SCALES), cfg.TRAIN.MAX_SIZE)
        self._name_to_top_map['data'] = idx
        idx += 1

        # When training the RPN the RoIs are the ground-truth boxes.
        if cfg.TRAIN.HAS_RPN:
            top[idx].reshape(1, 3)
            self._name_to_top_map['im_info'] = idx
            idx += 1

            top[idx].reshape(1, 4)
            self._name_to_top_map['gt_boxes'] = idx
            idx += 1
        # When training Fast R-CNN the RoIs are the region proposals that
        # were extracted earlier by the RPN.
        else: # not using RPN
            # rois blob: holds R regions of interest, each is a 5-tuple
            # (n, x1, y1, x2, y2) specifying an image batch index n and a
            # rectangle (x1, y1, x2, y2)
            top[idx].reshape(1, 5)
            self._name_to_top_map['rois'] = idx
            idx += 1

            # labels blob: R categorical labels in [0, ..., K] for K foreground
            # classes plus background
            top[idx].reshape(1)
            self._name_to_top_map['labels'] = idx
            idx += 1

            if cfg.TRAIN.BBOX_REG:
                # bbox_targets blob: R bounding-box regression targets with 4
                # targets per class
                top[idx].reshape(1, self._num_classes * 4)
                self._name_to_top_map['bbox_targets'] = idx
                idx += 1

                # bbox_inside_weights blob: At most 4 targets per roi are active;
                # this binary vector specifies the subset of active targets
                top[idx].reshape(1, self._num_classes * 4)
                self._name_to_top_map['bbox_inside_weights'] = idx
                idx += 1

                top[idx].reshape(1, self._num_classes * 4)
                self._name_to_top_map['bbox_outside_weights'] = idx
                idx += 1

        print 'RoiDataLayer: name_to_top:', self._name_to_top_map
        # The prototxt must declare exactly one top per blob registered above.
        assert len(top) == len(self._name_to_top_map)

    # Forward pass: this layer is the network's input.
    def forward(self, bottom, top):
        """Get blobs and copy them into this layer's top blob vector."""
        blobs = self._get_next_minibatch()

        for blob_name, blob in blobs.iteritems():
            top_ind = self._name_to_top_map[blob_name]
            # Reshape net's input blobs
            top[top_ind].reshape(*(blob.shape))
            # Copy data into net's input blobs
            top[top_ind].data[...] = blob.astype(np.float32, copy=False)

    # No backward pass is needed.
    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass

class BlobFetcher(Process):
    """Experimental class for prefetching blobs in a separate process."""
    def __init__(self, queue, roidb, num_classes):
        super(BlobFetcher, self).__init__()
        self._queue = queue
        self._roidb = roidb
        self._num_classes = num_classes
        self._perm = None
        self._cur = 0
        self._shuffle_roidb_inds()
        # fix the random seed for reproducibility
        # NOTE(review): the seed is set after the first shuffle above, so that
        # first permutation is not affected by it -- confirm this is intended.
        np.random.seed(cfg.RNG_SEED)

    def _shuffle_roidb_inds(self):
        """Randomly permute the training roidb."""
        # TODO(rbg): remove duplicated code
        self._perm = np.random.permutation(np.arange(len(self._roidb)))
        self._cur = 0

    def _get_next_minibatch_inds(self):
        """Return the roidb indices for the next minibatch."""
        # TODO(rbg): remove duplicated code
        if self._cur + cfg.TRAIN.IMS_PER_BATCH >= len(self._roidb):
            self._shuffle_roidb_inds()

        db_inds = self._perm[self._cur:self._cur + cfg.TRAIN.IMS_PER_BATCH]
        self._cur += cfg.TRAIN.IMS_PER_BATCH
        return db_inds

    def run(self):
        """Produce minibatches forever and push them onto the queue."""
        print 'BlobFetcher started'
        while True:
            db_inds = self._get_next_minibatch_inds()
            minibatch_db = [self._roidb[i] for i in db_inds]
            blobs = get_minibatch(minibatch_db, self._num_classes)
            # Blocks while the bounded queue is full.
            self._queue.put(blobs)
其中用到了lib/roi_data_layer/minibatch.py里的函数get_minibatch
#coding:utf-8
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""Compute minibatch blobs for training a Fast R-CNN network."""

import numpy as np
import numpy.random as npr
import cv2
from fast_rcnn.config import cfg
from utils.blob import prep_im_for_blob, im_list_to_blob


def get_minibatch(roidb, num_classes):
    """Given a roidb, construct a minibatch sampled from it.

    Args:
        roidb: list of per-image roidb entries making up this minibatch.
        num_classes: number of classes (including background); sets the
            width of the expanded bbox-regression blobs.

    Returns:
        dict mapping blob names to ndarrays that can be copied straight into
        the net's input blobs. With cfg.TRAIN.HAS_RPN the keys are 'data',
        'gt_boxes' and 'im_info'; otherwise 'data', 'rois', 'labels' and,
        when cfg.TRAIN.BBOX_REG is set, the three bbox_* blobs.
    """
    num_images = len(roidb)
    # Sample random scales to use for each image in this batch
    random_scale_inds = npr.randint(0, high=len(cfg.TRAIN.SCALES),
                                    size=num_images)
    # BATCH_SIZE is the number of RoIs used per minibatch.
    assert(cfg.TRAIN.BATCH_SIZE % num_images == 0), \
        'num_images ({}) must divide BATCH_SIZE ({})'. \
        format(num_images, cfg.TRAIN.BATCH_SIZE)
    # Number of RoIs to sample from each image. Floor division keeps this an
    # integer regardless of the interpreter's `/` semantics.
    rois_per_image = cfg.TRAIN.BATCH_SIZE // num_images
    # Number of foreground RoIs per image. np.round returns a float, so cast
    # it: the value is later used as a sample size and as a slice bound.
    fg_rois_per_image = int(np.round(cfg.TRAIN.FG_FRACTION * rois_per_image))

    # Get the input image blob, formatted for caffe
    im_blob, im_scales = _get_image_blob(roidb, random_scale_inds)

    blobs = {'data': im_blob}

    # Training the RPN
    if cfg.TRAIN.HAS_RPN:
        assert len(im_scales) == 1, "Single batch only"
        assert len(roidb) == 1, "Single batch only"
        # gt boxes: (x1, y1, x2, y2, cls)
        # Keep only entries whose ground-truth class is not background (0).
        gt_inds = np.where(roidb[0]['gt_classes'] != 0)[0]
        gt_boxes = np.empty((len(gt_inds), 5), dtype=np.float32)
        # Box coordinates are scaled into the resized training image.
        gt_boxes[:, 0:4] = roidb[0]['boxes'][gt_inds, :] * im_scales[0]
        gt_boxes[:, 4] = roidb[0]['gt_classes'][gt_inds]
        blobs['gt_boxes'] = gt_boxes
        # im_info row: (blob height, blob width, scale factor)
        blobs['im_info'] = np.array(
            [[im_blob.shape[2], im_blob.shape[3], im_scales[0]]],
            dtype=np.float32)
    # Training Fast R-CNN
    else: # not using RPN
        # Now, build the region of interest and label blobs
        rois_blob = np.zeros((0, 5), dtype=np.float32)
        labels_blob = np.zeros((0), dtype=np.float32)
        bbox_targets_blob = np.zeros((0, 4 * num_classes), dtype=np.float32)
        bbox_inside_blob = np.zeros(bbox_targets_blob.shape, dtype=np.float32)
        # all_overlaps = []
        for im_i in range(num_images):
            # Sample RoIs from this image's roidb entry.
            labels, overlaps, im_rois, bbox_targets, bbox_inside_weights \
                = _sample_rois(roidb[im_i], fg_rois_per_image, rois_per_image,
                               num_classes)

            # Add to RoIs blob
            rois = _project_im_rois(im_rois, im_scales[im_i])
            batch_ind = im_i * np.ones((rois.shape[0], 1))
            rois_blob_this_image = np.hstack((batch_ind, rois))
            rois_blob = np.vstack((rois_blob, rois_blob_this_image))

            # Add to labels, bbox targets, and bbox loss blobs
            labels_blob = np.hstack((labels_blob, labels))
            bbox_targets_blob = np.vstack((bbox_targets_blob, bbox_targets))
            bbox_inside_blob = np.vstack((bbox_inside_blob, bbox_inside_weights))
            # all_overlaps = np.hstack((all_overlaps, overlaps))

        # For debug visualizations
        # _vis_minibatch(im_blob, rois_blob, labels_blob, all_overlaps)

        blobs['rois'] = rois_blob
        blobs['labels'] = labels_blob

        if cfg.TRAIN.BBOX_REG:
            blobs['bbox_targets'] = bbox_targets_blob
            blobs['bbox_inside_weights'] = bbox_inside_blob
            # Outside weights are 1 wherever an inside weight is positive.
            blobs['bbox_outside_weights'] = \
                np.array(bbox_inside_blob > 0).astype(np.float32)

    return blobs


def _sample_rois(roidb, fg_rois_per_image, rois_per_image, num_classes):
    """Generate a random sample of RoIs comprising foreground and background
    examples.

    Args:
        roidb: roidb entry of a single image.
        fg_rois_per_image: maximum number of foreground RoIs to draw.
        rois_per_image: total number of RoIs wanted from this image.
        num_classes: number of classes, forwarded to the target expansion.

    Returns:
        (labels, overlaps, rois, bbox_targets, bbox_inside_weights) for the
        sampled RoIs, foreground samples first.
    """
    # label = class RoI has max overlap with
    labels = roidb['max_classes']
    overlaps = roidb['max_overlaps']
    rois = roidb['boxes']

    # Select foreground RoIs as those with >= FG_THRESH overlap
    fg_inds = np.where(overlaps >= cfg.TRAIN.FG_THRESH)[0]
    # Guard against the case when an image has fewer than fg_rois_per_image
    # foreground RoIs. The int() cast keeps the count usable as a sample size
    # and slice bound even if a caller passes a float.
    fg_rois_per_this_image = int(min(fg_rois_per_image, fg_inds.size))
    # Sample foreground regions without replacement
    if fg_inds.size > 0:
        fg_inds = npr.choice(
                fg_inds, size=fg_rois_per_this_image, replace=False)

    # Select background RoIs as those within [BG_THRESH_LO, BG_THRESH_HI)
    bg_inds = np.where((overlaps < cfg.TRAIN.BG_THRESH_HI) &
                       (overlaps >= cfg.TRAIN.BG_THRESH_LO))[0]
    # Compute number of background RoIs to take from this image (guarding
    # against there being fewer than desired)
    bg_rois_per_this_image = rois_per_image - fg_rois_per_this_image
    bg_rois_per_this_image = int(min(bg_rois_per_this_image, bg_inds.size))
    # Blog author's note: a large fg/bg imbalance can cause trouble and
    # would call for re-balancing, which is not done here.
    # Sample background regions without replacement
    if bg_inds.size > 0:
        bg_inds = npr.choice(
                bg_inds, size=bg_rois_per_this_image, replace=False)

    # The indices that we're selecting (both fg and bg)
    keep_inds = np.append(fg_inds, bg_inds)
    # Select sampled values from various arrays:
    labels = labels[keep_inds]
    # Clamp labels for the background RoIs to 0
    labels[fg_rois_per_this_image:] = 0
    overlaps = overlaps[keep_inds]
    rois = rois[keep_inds]

    bbox_targets, bbox_inside_weights = _get_bbox_regression_labels(
            roidb['bbox_targets'][keep_inds, :], num_classes)

    return labels, overlaps, rois, bbox_targets, bbox_inside_weights


def _get_image_blob(roidb, scale_inds):
    """Builds an input blob from the images in the roidb at the specified
    scales.

    Returns:
        (blob, im_scales): the image blob and the per-image scale factors.
    """
    num_images = len(roidb)
    processed_ims = []
    im_scales = []
    for i in range(num_images):
        # Load the image this roidb entry refers to.
        im = cv2.imread(roidb[i]['image'])
        # Entries produced by horizontal-flip augmentation are mirrored here.
        if roidb[i]['flipped']:
            im = im[:, ::-1, :]
        # Pick the training scale selected for this image.
        target_size = cfg.TRAIN.SCALES[scale_inds[i]]
        im, im_scale = prep_im_for_blob(im, cfg.PIXEL_MEANS, target_size,
                                        cfg.TRAIN.MAX_SIZE)
        im_scales.append(im_scale)
        processed_ims.append(im)

    # Create a blob to hold the input images
    # (im_list_to_blob lives in lib/utils/blob.py)
    blob = im_list_to_blob(processed_ims)

    return blob, im_scales


def _project_im_rois(im_rois, im_scale_factor):
    """Project image RoIs into the rescaled training image."""
    rois = im_rois * im_scale_factor
    return rois


def _get_bbox_regression_labels(bbox_target_data, num_classes):
    """Bounding-box regression targets are stored in a compact form in the
    roidb.

    This function expands those targets into the 4-of-4*K representation used
    by the network (i.e. only one class has non-zero targets). The loss weights
    are similarly expanded.

    Returns:
        bbox_target_data (ndarray): N x 4K blob of regression targets
        bbox_inside_weights (ndarray): N x 4K blob of loss weights
    """
    clss = bbox_target_data[:, 0]
    bbox_targets = np.zeros((clss.size, 4 * num_classes), dtype=np.float32)
    bbox_inside_weights = np.zeros(bbox_targets.shape, dtype=np.float32)
    inds = np.where(clss > 0)[0]
    for ind in inds:
        # The class id is stored in a float array; slice bounds must be ints.
        cls = int(clss[ind])
        start = 4 * cls
        end = start + 4
        bbox_targets[ind, start:end] = bbox_target_data[ind, 1:]
        bbox_inside_weights[ind, start:end] = cfg.TRAIN.BBOX_INSIDE_WEIGHTS
    return bbox_targets, bbox_inside_weights


def _vis_minibatch(im_blob, rois_blob, labels_blob, overlaps):
    """Visualize a mini-batch for debugging."""
    import matplotlib.pyplot as plt
    for i in range(rois_blob.shape[0]):
        rois = rois_blob[i, :]
        # Column 0 holds the batch index as a float; indexing needs an int.
        im_ind = int(rois[0])
        roi = rois[1:]
        # Undo the blob layout and mean subtraction, then BGR -> RGB.
        im = im_blob[im_ind, :, :, :].transpose((1, 2, 0)).copy()
        im += cfg.PIXEL_MEANS
        im = im[:, :, (2, 1, 0)]
        im = im.astype(np.uint8)
        cls = labels_blob[i]
        plt.imshow(im)
        # Single pre-formatted string: prints the same text under both the
        # print statement and the print function.
        print('class:  {}  overlap:  {}'.format(cls, overlaps[i]))
        plt.gca().add_patch(
            plt.Rectangle((roi[0], roi[1]), roi[2] - roi[0],
                          roi[3] - roi[1], fill=False,
                          edgecolor='r', linewidth=3)
            )
        plt.show()
其中用到了lib/utils/blob.py里的函数
# --------------------------------------------------------
# Fast R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick
# --------------------------------------------------------

"""Blob helper functions."""

import numpy as np
import cv2

def im_list_to_blob(ims):
    """Convert a list of images into a network input.

    Assumes images are already prepared (means subtracted, BGR order, ...).
    Images smaller than the largest one in the list are zero-padded at the
    bottom/right. Returns a float32 array laid out as
    (batch elem, channel, height, width).
    """
    max_shape = np.array([im.shape for im in ims]).max(axis=0)
    num_images = len(ims)
    blob = np.zeros((num_images, max_shape[0], max_shape[1], 3),
                    dtype=np.float32)
    for i in xrange(num_images):
        im = ims[i]
        blob[i, 0:im.shape[0], 0:im.shape[1], :] = im
    # Move channels (axis 3) to axis 1
    # Axis order will become: (batch elem, channel, height, width)
    channel_swap = (0, 3, 1, 2)
    blob = blob.transpose(channel_swap)
    return blob

def prep_im_for_blob(im, pixel_means, target_size, max_size):
    """Mean subtract and scale an image for use in a blob.

    The shorter side is scaled to target_size unless that would push the
    longer side past max_size, in which case the longer side is scaled to
    max_size instead. Returns (resized image, scale factor used).
    """
    # copy=False: an input that is already float32 may be modified in place
    # by the mean subtraction below.
    im = im.astype(np.float32, copy=False)
    im -= pixel_means
    im_shape = im.shape
    im_size_min = np.min(im_shape[0:2])
    im_size_max = np.max(im_shape[0:2])
    im_scale = float(target_size) / float(im_size_min)
    # Prevent the biggest axis from being more than MAX_SIZE
    if np.round(im_scale * im_size_max) > max_size:
        im_scale = float(max_size) / float(im_size_max)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale,
                    interpolation=cv2.INTER_LINEAR)

    return im, im_scale

lib/rpn/anchor_target_layer.py
#coding:utf-8
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import os
import caffe
import yaml
from fast_rcnn.config import cfg
import numpy as np
import numpy.random as npr
from generate_anchors import generate_anchors
from utils.cython_bbox import bbox_overlaps
from fast_rcnn.bbox_transform import bbox_transform

DEBUG = False

class AnchorTargetLayer(caffe.Layer):
    """
    Assign anchors to ground-truth targets. Produces anchor classification
    labels and bounding-box regression targets.
    """

    def setup(self, bottom, top):
        """Read the layer parameters, build the reference anchors and give
        the four top blobs their initial shapes."""
        # NOTE(review): yaml.load is called without an explicit Loader;
        # acceptable only because param_str comes from the trusted prototxt.
        layer_params = yaml.load(self.param_str_)
        # Anchor scales; three of them unless 'scales' is given in param_str.
        anchor_scales = layer_params.get('scales', (8, 16, 32))
        # Reference anchors around the base window; with the default aspect
        # ratios of generate_anchors this gives 3 ratios x 3 scales = 9.
        self._anchors = generate_anchors(scales=np.array(anchor_scales))
        self._num_anchors = self._anchors.shape[0]
        # All other anchors are produced by shifting the reference anchors in
        # steps of feat_stride, which is read from the layer's param_str (the
        # training prototxt sets it to 16). Blog author's note: 16 matches the
        # backbone's total stride at conv5, so the anchors end up spread
        # roughly evenly over the input image.
        self._feat_stride = layer_params['feat_stride']

        if DEBUG:
            print 'anchors:'
            print self._anchors
            print 'anchor shapes:'
            print np.hstack((
                self._anchors[:, 2::4] - self._anchors[:, 0::4],
                self._anchors[:, 3::4] - self._anchors[:, 1::4],
            ))
            # Running statistics, only maintained in DEBUG mode.
            self._counts = cfg.EPS
            self._sums = np.zeros((1, 4))
            self._squared_sums = np.zeros((1, 4))
            self._fg_sum = 0
            self._bg_sum = 0
            self._count = 0

        # allow boxes to sit over the edge by a small amount
        self._allowed_border = layer_params.get('allowed_border', 0)

        # Height and width of the feature map (bottom[0]).
        height, width = bottom[0].data.shape[-2:]
        if DEBUG:
            print 'AnchorTargetLayer: height', height, 'width', width

        A = self._num_anchors
        # labels
        top[0].reshape(1, 1, A * height, width)
        # bbox_targets
        top[1].reshape(1, A * 4, height, width)
        # bbox_inside_weights
        top[2].reshape(1, A * 4, height, width)
        # bbox_outside_weights
        top[3].reshape(1, A * 4, height, width)

    def forward(self, bottom, top):
        """Label every anchor and compute its regression target.

        Bottoms read here: bottom[0] score map (only its spatial size is
        used), bottom[1] gt boxes, bottom[2] im_info. Tops written: labels,
        bbox_targets, bbox_inside_weights, bbox_outside_weights.
        """
        # Algorithm:
        #
        # for each (H, W) location i
        #   generate 9 anchor boxes centered on cell i
        #   apply predicted bbox deltas at cell i to each of the 9 anchors
        # filter out-of-image anchors
        # measure GT overlap

        assert bottom[0].data.shape[0] == 1, \
            'Only single item batches are supported'

        # map of shape (..., H, W)
        height, width = bottom[0].data.shape[-2:]
        # GT boxes (x1, y1, x2, y2, label)
        gt_boxes = bottom[1].data
        # im_info
        im_info = bottom[2].data[0, :]

        if DEBUG:
            print ''
            print 'im_size: ({}, {})'.format(im_info[0], im_info[1])
            print 'scale: {}'.format(im_info[2])
            print 'height, width: ({}, {})'.format(height, width)
            print 'rpn: gt_boxes.shape', gt_boxes.shape
            print 'rpn: gt_boxes', gt_boxes

        # 1. Generate proposals from bbox deltas and shifted anchors
        shift_x = np.arange(0, width) * self._feat_stride
        shift_y = np.arange(0, height) * self._feat_stride
        shift_x, shift_y = np.meshgrid(shift_x, shift_y)
        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                            shift_x.ravel(), shift_y.ravel())).transpose()
        # add A anchors (1, A, 4) to
        # cell K shifts (K, 1, 4) to get
        # shift anchors (K, A, 4)
        # reshape to (K*A, 4) shifted anchors
        A = self._num_anchors
        K = shifts.shape[0]
        all_anchors = (self._anchors.reshape((1, A, 4)) +
                       shifts.reshape((1, K, 4)).transpose((1, 0, 2)))
        all_anchors = all_anchors.reshape((K * A, 4))
        total_anchors = int(K * A)

        # only keep anchors inside the image
        inds_inside = np.where(
            (all_anchors[:, 0] >= -self._allowed_border) &
            (all_anchors[:, 1] >= -self._allowed_border) &
            (all_anchors[:, 2] < im_info[1] + self._allowed_border) &  # width
            (all_anchors[:, 3] < im_info[0] + self._allowed_border)    # height
        )[0]

        if DEBUG:
            print 'total_anchors', total_anchors
            print 'inds_inside', len(inds_inside)

        # Drop anchors that reach outside the image; inds_inside holds the
        # indices of the anchors that lie inside it.
        # keep only inside anchors
        anchors = all_anchors[inds_inside, :]
        if DEBUG:
            print 'anchors.shape', anchors.shape

        # label: 1 is positive, 0 is negative, -1 is dont care
        labels = np.empty((len(inds_inside), ), dtype=np.float32)
        labels.fill(-1)

        # overlaps between the anchors and the gt boxes
        # overlaps (ex, gt)
        # NOTE(review): np.float is a deprecated alias in recent NumPy
        # releases -- confirm the NumPy version this runs against.
        overlaps = bbox_overlaps(
            np.ascontiguousarray(anchors, dtype=np.float),
            np.ascontiguousarray(gt_boxes, dtype=np.float))
        # Best-matching gt box for every anchor ...
        argmax_overlaps = overlaps.argmax(axis=1)
        max_overlaps = overlaps[np.arange(len(inds_inside)), argmax_overlaps]
        # ... and best-matching anchor(s) for every gt box, ties included.
        gt_argmax_overlaps = overlaps.argmax(axis=0)
        gt_max_overlaps = overlaps[gt_argmax_overlaps,
                                   np.arange(overlaps.shape[1])]
        gt_argmax_overlaps = np.where(overlaps == gt_max_overlaps)[0]

        if not cfg.TRAIN.RPN_CLOBBER_POSITIVES:
            # assign bg labels first so that positive labels can clobber them
            labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

        # fg label: for each gt, anchor with highest overlap
        labels[gt_argmax_overlaps] = 1

        # fg label: above threshold IOU
        labels[max_overlaps >= cfg.TRAIN.RPN_POSITIVE_OVERLAP] = 1

        if cfg.TRAIN.RPN_CLOBBER_POSITIVES:
            # assign bg labels last so that negative labels can clobber positives
            labels[max_overlaps < cfg.TRAIN.RPN_NEGATIVE_OVERLAP] = 0

        # Subsample positive and negative anchors. Blog author's note: with a
        # strong imbalance the fg:bg ratio should be kept close to 1:1,
        # otherwise many objects are missed; the code below caps both counts
        # but does not enforce that balance.
        # subsample positive labels if we have too many
        num_fg = int(cfg.TRAIN.RPN_FG_FRACTION * cfg.TRAIN.RPN_BATCHSIZE)
        fg_inds = np.where(labels == 1)[0]
        if len(fg_inds) > num_fg:
            disable_inds = npr.choice(
                fg_inds, size=(len(fg_inds) - num_fg), replace=False)
            labels[disable_inds] = -1

        # subsample negative labels if we have too many
        num_bg = cfg.TRAIN.RPN_BATCHSIZE - np.sum(labels == 1)
        bg_inds = np.where(labels == 0)[0]
        if len(bg_inds) > num_bg:
            disable_inds = npr.choice(
                bg_inds, size=(len(bg_inds) - num_bg), replace=False)
            labels[disable_inds] = -1
            #print "was %s inds, disabling %s, now %s inds" % (
                #len(bg_inds), len(disable_inds), np.sum(labels == 0))

        # The zeros array is overwritten immediately by the computed targets.
        bbox_targets = np.zeros((len(inds_inside), 4), dtype=np.float32)
        bbox_targets = _compute_targets(anchors, gt_boxes[argmax_overlaps, :])

        bbox_inside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
        bbox_inside_weights[labels == 1, :] = np.array(cfg.TRAIN.RPN_BBOX_INSIDE_WEIGHTS)

        bbox_outside_weights = np.zeros((len(inds_inside), 4), dtype=np.float32)
        if cfg.TRAIN.RPN_POSITIVE_WEIGHT < 0:
            # uniform weighting of examples (given non-uniform sampling)
            num_examples = np.sum(labels >= 0)
            positive_weights = np.ones((1, 4)) * 1.0 / num_examples
            negative_weights = np.ones((1, 4)) * 1.0 / num_examples
        else:
            assert ((cfg.TRAIN.RPN_POSITIVE_WEIGHT > 0) &
                    (cfg.TRAIN.RPN_POSITIVE_WEIGHT < 1))
            positive_weights = (cfg.TRAIN.RPN_POSITIVE_WEIGHT /
                                np.sum(labels == 1))
            negative_weights = ((1.0 - cfg.TRAIN.RPN_POSITIVE_WEIGHT) /
                                np.sum(labels == 0))
        bbox_outside_weights[labels == 1, :] = positive_weights
        bbox_outside_weights[labels == 0, :] = negative_weights

        if DEBUG:
            self._sums += bbox_targets[labels == 1, :].sum(axis=0)
            self._squared_sums += (bbox_targets[labels == 1, :] ** 2).sum(axis=0)
            self._counts += np.sum(labels == 1)
            means = self._sums / self._counts
            stds = np.sqrt(self._squared_sums / self._counts - means ** 2)
            print 'means:'
            print means
            print 'stdevs:'
            print stds

        # map up to original set of anchors
        labels = _unmap(labels, total_anchors, inds_inside, fill=-1)
        bbox_targets = _unmap(bbox_targets, total_anchors, inds_inside, fill=0)
        bbox_inside_weights = _unmap(bbox_inside_weights, total_anchors, inds_inside, fill=0)
        bbox_outside_weights = _unmap(bbox_outside_weights, total_anchors, inds_inside, fill=0)

        if DEBUG:
            print 'rpn: max max_overlap', np.max(max_overlaps)
            print 'rpn: num_positive', np.sum(labels == 1)
            print 'rpn: num_negative', np.sum(labels == 0)
            self._fg_sum += np.sum(labels == 1)
            self._bg_sum += np.sum(labels == 0)
            self._count += 1
            print 'rpn: num_positive avg', self._fg_sum / self._count
            print 'rpn: num_negative avg', self._bg_sum / self._count

        # labels
        labels = labels.reshape((1, height, width, A)).transpose(0, 3, 1, 2)
        labels = labels.reshape((1, 1, A * height, width))
        top[0].reshape(*labels.shape)
        top[0].data[...] = labels

        # bbox_targets
        bbox_targets = bbox_targets \
            .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
        top[1].reshape(*bbox_targets.shape)
        top[1].data[...] = bbox_targets

        # bbox_inside_weights
        bbox_inside_weights = bbox_inside_weights \
            .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
        assert bbox_inside_weights.shape[2] == height
        assert bbox_inside_weights.shape[3] == width
        top[2].reshape(*bbox_inside_weights.shape)
        top[2].data[...] = bbox_inside_weights

        # bbox_outside_weights
        bbox_outside_weights = bbox_outside_weights \
            .reshape((1, height, width, A * 4)).transpose(0, 3, 1, 2)
        assert bbox_outside_weights.shape[2] == height
        assert bbox_outside_weights.shape[3] == width
        top[3].reshape(*bbox_outside_weights.shape)
        top[3].data[...] = bbox_outside_weights

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass


def _unmap(data, count, inds, fill=0):
    """ Unmap a subset of item (data) back to the original set of items (of
    size count).

    Rows listed in inds receive data; every other row is set to fill.
    The result is always float32.
    """
    if len(data.shape) == 1:
        ret = np.empty((count, ), dtype=np.float32)
        ret.fill(fill)
        ret[inds] = data
    else:
        ret = np.empty((count, ) + data.shape[1:], dtype=np.float32)
        ret.fill(fill)
        ret[inds, :] = data
    return ret


def _compute_targets(ex_rois, gt_rois):
    """Compute bounding-box regression targets for an image.

    ex_rois is N x 4 and gt_rois is N x 5 (the asserts below enforce this);
    only the first four gt columns are passed on to bbox_transform.
    """

    assert ex_rois.shape[0] == gt_rois.shape[0]
    assert ex_rois.shape[1] == 4
    assert gt_rois.shape[1] == 5

    return bbox_transform(ex_rois, gt_rois[:, :4]).astype(np.float32, copy=False)

用到了lib/rpn/generate_anchors.py里的函数
#coding:utf-8
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

"""Generate the reference anchor windows used by the RPN."""

import numpy as np

# Reference values from Shaoqing's MATLAB implementation. Each box is
# (xmin, ymin, xmax, ymax); MATLAB is 1-based, so these boxes are centred on
# (8.5, 8.5), which is why some coordinates are negative:
#
# >> load output/rpn_cachedir/faster_rcnn_VOC2007_ZF_stage1_rpn/anchors.mat
# >> anchors
#
# anchors =
#
#    -83   -39   100    56
#   -175   -87   192   104
#   -359  -183   376   200
#    -55   -55    72    72
#   -119  -119   136   136
#   -247  -247   264   264
#    -35   -79    52    96
#    -79  -167    96   184
#   -167  -343   184   360
#
# generate_anchors() works in 0-based coordinates, so with its default
# arguments it returns the same boxes shifted by -1 (centre (7.5, 7.5)):
#
# array([[ -84.,  -40.,   99.,   55.],
#        [-176.,  -88.,  191.,  103.],
#        [-360., -184.,  375.,  199.],
#        [ -56.,  -56.,   71.,   71.],
#        [-120., -120.,  135.,  135.],
#        [-248., -248.,  263.,  263.],
#        [ -36.,  -80.,   51.,   95.],
#        [ -80., -168.,   95.,  183.],
#        [-168., -344.,  183.,  359.]])


def generate_anchors(base_size=16, ratios=(0.5, 1, 2),
                     scales=2**np.arange(3, 6)):
    """
    Generate anchor (reference) windows by enumerating aspect
    ratios X scales wrt a reference (0, 0, base_size - 1, base_size - 1)
    window.

    Args:
        base_size: side length of the square reference window.
        ratios: sequence of height/width aspect ratios.
        scales: sequence of factors applied to width and height.

    Returns:
        (len(ratios) * len(scales), 4) float array of
        (xmin, ymin, xmax, ymax) boxes, ordered ratio-major.
    """
    # The reference window is (0, 0, base_size - 1, base_size - 1); all other
    # anchors are derived from it.
    base_anchor = np.array([1, 1, base_size, base_size]) - 1
    # Same centre and (approximately) same area, different aspect ratios.
    ratio_anchors = _ratio_enum(base_anchor, ratios)
    anchors = np.vstack([_scale_enum(ratio_anchor, scales)
                         for ratio_anchor in ratio_anchors])
    return anchors


def _whctrs(anchor):
    """
    Return width, height, x center, and y center for an anchor (window).
    """
    # Coordinates are inclusive pixel indices, hence the +1.
    w = anchor[2] - anchor[0] + 1
    h = anchor[3] - anchor[1] + 1
    x_ctr = anchor[0] + 0.5 * (w - 1)
    y_ctr = anchor[1] + 0.5 * (h - 1)
    return w, h, x_ctr, y_ctr


def _mkanchors(ws, hs, x_ctr, y_ctr):
    """
    Given a vector of widths (ws) and heights (hs) around a center
    (x_ctr, y_ctr), output a set of anchors (windows).
    """
    ws = ws[:, np.newaxis]
    hs = hs[:, np.newaxis]
    anchors = np.hstack((x_ctr - 0.5 * (ws - 1),
                         y_ctr - 0.5 * (hs - 1),
                         x_ctr + 0.5 * (ws - 1),
                         y_ctr + 0.5 * (hs - 1)))
    return anchors


def _ratio_enum(anchor, ratios):
    """
    Enumerate a set of anchors for each aspect ratio wrt an anchor.
    """
    # Cast to float so integer ratios are not floor-divided on interpreters
    # where `/` on integers truncates, and so lists/tuples broadcast.
    ratios = np.asarray(ratios, dtype=np.float64)
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    size = w * h
    size_ratios = size / ratios
    ws = np.round(np.sqrt(size_ratios))
    hs = np.round(ws * ratios)
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


def _scale_enum(anchor, scales):
    """
    Enumerate a set of anchors for each scale wrt an anchor.

    The aspect ratio is kept; width and height are both multiplied by each
    scale.
    """
    scales = np.asarray(scales)
    w, h, x_ctr, y_ctr = _whctrs(anchor)
    ws = w * scales
    hs = h * scales
    anchors = _mkanchors(ws, hs, x_ctr, y_ctr)
    return anchors


if __name__ == '__main__':
    import time
    t = time.time()
    a = generate_anchors()
    # Single-argument print calls behave the same as a statement or function.
    print(time.time() - t)
    print(a)
    from IPython import embed
    embed()
models/pascal_voc/ZF/faster_rcnn_alt_opt/rpn_test.pt
# Network definition used when the RPN generates region proposals; this net
# only runs the forward pass.
name: "ZF"

input: "data"
input_shape {
  dim: 1
  dim: 3
  dim: 224
  dim: 224
}

input: "im_info"
input_shape {
  dim: 1
  dim: 3
}

# The leading layers are the ZF net, used for feature extraction and shared.
# ------------------------ layer 1 -----------------------------
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  convolution_param {
    num_output: 96
    kernel_size: 7
    pad: 3
    stride: 2
  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "norm1"
  type: "LRN"
  bottom: "conv1"
  top: "norm1"
  lrn_param {
    local_size: 3
    alpha: 0.00005
    beta: 0.75
    norm_region: WITHIN_CHANNEL
    engine: CAFFE
  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "norm1"
  top: "pool1"
  pooling_param {
    kernel_size: 3
    stride: 2
    pad: 1
    pool: MAX
  }
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  convolution_param {
    num_output: 256
    kernel_size: 5
    pad: 2
    stride: 2
  }
}
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param {
    local_size: 3
    alpha: 0.00005
    beta: 0.75
    norm_region: WITHIN_CHANNEL
    engine: CAFFE
  }
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param {
    kernel_size: 3
    stride: 2
    pad: 1
    pool: MAX
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  convolution_param {
    num_output: 384
    kernel_size: 3
    pad: 1
    stride: 1
  }
}
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
}
layer {
  name: "conv4"
  type: "Convolution"
  bottom: "conv3"
  top: "conv4"
  convolution_param {
    num_output: 384
    kernel_size: 3
    pad: 1
    stride: 1
  }
}
layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4"
  top: "conv4"
}
layer {
  name: "conv5"
  type: "Convolution"
  bottom: "conv4"
  top: "conv5"
  convolution_param {
    num_output: 256  # this last conv layer produces 256 feature maps
    kernel_size: 3
    pad: 1
    stride: 1
  }
}
layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5"
  top: "conv5"
}

#-----------------------layer +-------------------------
# The RPN slides a window over conv5 (256*3*3*256 convolution kernel) and
# predicts, for each of the 9 anchors at every position, whether it is
# foreground and, if it is foreground, the corrected box position.
layer {
  name: "rpn_conv1"
  type: "Convolution"
  bottom: "conv5"
  top: "rpn_conv1"
  convolution_param {
    num_output: 256
    kernel_size: 3
    pad: 1
    stride: 1
  }
}
layer {
  name: "rpn_relu1"
  type: "ReLU"
  bottom: "rpn_conv1"
  top: "rpn_conv1"
}
layer {
  name: "rpn_cls_score"
  type: "Convolution"
  bottom: "rpn_conv1"
  top: "rpn_cls_score"
  convolution_param {
    num_output: 18  # 2(bg/fg) * 9(anchors): bg/fg prediction for each of the 9 anchors at every position
    kernel_size: 1
    pad: 0
    stride: 1
  }
}
layer {
  name: "rpn_bbox_pred"
  type: "Convolution"
  bottom: "rpn_conv1"
  top: "rpn_bbox_pred"
  convolution_param {
    num_output: 36  # 4 * 9(anchors): corrected coordinates predicted for the 9 anchors
    kernel_size: 1
    pad: 0
    stride: 1
  }
}
# NOTE(review): presumably Caffe's Reshape treats "dim: 0" as "copy from the
# bottom blob" and "dim: -1" as "infer"; confirm against the Caffe docs.
layer {
  bottom: "rpn_cls_score"
  top: "rpn_cls_score_reshape"
  name: "rpn_cls_score_reshape"
  type: "Reshape"
  reshape_param {
    shape {
      dim: 0
      dim: 2
      dim: -1
      dim: 0
    }
  }
}

#-----------------------output------------------------
layer {
  name: "rpn_cls_prob"
  type: "Softmax"
  bottom: "rpn_cls_score_reshape"
  top: "rpn_cls_prob"
}
layer {
  name: 'rpn_cls_prob_reshape'
  type: 'Reshape'
  bottom: 'rpn_cls_prob'
  top: 'rpn_cls_prob_reshape'
  reshape_param {
    shape {
      dim: 0
      dim: 18
      dim: -1
      dim: 0
    }
  }
}
layer {
  name: 'proposal'
  type: 'Python'
  bottom: 'rpn_cls_prob_reshape'
  bottom: 'rpn_bbox_pred'
  bottom: 'im_info'
  top: 'rois'
  top: 'scores'
  python_param {
    module: 'rpn.proposal_layer'  # corresponds to lib/rpn/proposal_layer.py
    layer: 'ProposalLayer'
    param_str: "'feat_stride': 16"
  }
}
lib/rpn/proposal_layer.py,这一层用来由RPN产生region proposal
#coding:utf-8
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import caffe
import numpy as np
import yaml
from fast_rcnn.config import cfg
from generate_anchors import generate_anchors
from fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes
from fast_rcnn.nms_wrapper import nms

DEBUG = False


class ProposalLayer(caffe.Layer):
    """
    Outputs object detection proposals by applying estimated bounding-box
    transformations to a set of regular boxes (called "anchors").
    """

    def setup(self, bottom, top):
        """Parse the layer parameters, build the anchors and size the tops.

        Reads 'feat_stride' (required) and 'scales' (optional, defaults to
        (8, 16, 32)) from the layer's YAML parameter string.
        """
        # parse the layer parameter string, which must be valid YAML.
        # safe_load is used because the string only needs plain YAML data;
        # yaml.load() without an explicit Loader can construct arbitrary
        # objects and is rejected by current PyYAML releases.
        layer_params = yaml.safe_load(self.param_str_)

        # Stride of the feature map relative to the input image (e.g. 16 in
        # rpn_test.pt: the feature map is 1/16 the size of the input).
        self._feat_stride = layer_params['feat_stride']
        anchor_scales = layer_params.get('scales', (8, 16, 32))
        # Generate the reference anchors.
        self._anchors = generate_anchors(scales=np.array(anchor_scales))
        self._num_anchors = self._anchors.shape[0]

        if DEBUG:
            print('feat_stride: {}'.format(self._feat_stride))
            print('anchors:')
            print(self._anchors)

        # rois blob: holds R regions of interest, each is a 5-tuple
        # (n, x1, y1, x2, y2) specifying an image batch index n and a
        # rectangle (x1, y1, x2, y2)
        top[0].reshape(1, 5)

        # scores blob: holds scores for R regions of interest
        if len(top) > 1:
            top[1].reshape(1, 1, 1, 1)

    def forward(self, bottom, top):
        """Turn RPN outputs into a ranked, NMS-filtered list of proposals.

        bottom[0]: class probabilities; the second set of _num_anchors
            channels (the fg probs) is used as the score.
        bottom[1]: predicted bbox deltas.
        bottom[2]: im_info; its first row holds the image size (first two
            entries) and the image scale (third entry).
        top[0]: rois, one row (batch_index, x1, y1, x2, y2) per proposal.
        top[1] (optional): the score of each returned proposal.
        """
        # Algorithm:
        #
        # for each (H, W) location i
        #   1. generate A anchor boxes centered on cell i
        #   2. apply predicted bbox deltas at cell i to each of the A anchors
        #   3. clip predicted boxes to image
        #   4. remove predicted boxes with either height or width < threshold
        #   5. sort all (proposal, score) pairs by score from highest to lowest
        #   6. take top pre_nms_topN proposals before NMS
        #   7. apply NMS with threshold 0.7 to remaining proposals
        #   8. take after_nms_topN proposals after NMS
        #   9. return the top proposals (-> RoIs top, scores top)

        assert bottom[0].data.shape[0] == 1, \
            'Only single item batches are supported'

        cfg_key = str(self.phase)  # either 'TRAIN' or 'TEST'
        pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
        post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
        nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
        min_size = cfg[cfg_key].RPN_MIN_SIZE

        # the first set of _num_anchors channels are bg probs
        # the second set are the fg probs, which we want
        scores = bottom[0].data[:, self._num_anchors:, :, :]
        bbox_deltas = bottom[1].data
        im_info = bottom[2].data[0, :]

        if DEBUG:
            print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
            print('scale: {}'.format(im_info[2]))

        # 1. Generate proposals from bbox deltas and shifted anchors
        height, width = scores.shape[-2:]

        if DEBUG:
            print('score map size: {}'.format(scores.shape))

        # Enumerate all shifts
        shift_x = np.arange(0, width) * self._feat_stride
        shift_y = np.arange(0, height) * self._feat_stride
        shift_x, shift_y = np.meshgrid(shift_x, shift_y)
        shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                            shift_x.ravel(), shift_y.ravel())).transpose()

        # Enumerate all shifted anchors:
        #
        # add A anchors (1, A, 4) to
        # cell K shifts (K, 1, 4) to get
        # shift anchors (K, A, 4)
        # reshape to (K*A, 4) shifted anchors
        A = self._num_anchors
        K = shifts.shape[0]
        anchors = self._anchors.reshape((1, A, 4)) + \
            shifts.reshape((1, K, 4)).transpose((1, 0, 2))
        anchors = anchors.reshape((K * A, 4))

        # Transpose and reshape predicted bbox transformations to get them
        # into the same order as the anchors:
        #
        # bbox deltas will be (1, 4 * A, H, W) format
        # transpose to (1, H, W, 4 * A)
        # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
        # in slowest to fastest order
        bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))

        # Same story for the scores:
        #
        # scores are (1, A, H, W) format
        # transpose to (1, H, W, A)
        # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
        scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))

        # Convert anchors into proposals via bbox transformations
        proposals = bbox_transform_inv(anchors, bbox_deltas)

        # 2. clip predicted boxes to image
        proposals = clip_boxes(proposals, im_info[:2])

        # Filter out proposals whose width or height is below RPN_MIN_SIZE.
        # 3. remove predicted boxes with either height or width < threshold
        # (NOTE: convert min_size to input image scale stored in im_info[2])
        keep = _filter_boxes(proposals, min_size * im_info[2])
        proposals = proposals[keep, :]
        scores = scores[keep]

        # 4. sort all (proposal, score) pairs by score from highest to lowest
        # 5. take top pre_nms_topN (e.g. 6000)
        order = scores.ravel().argsort()[::-1]
        if pre_nms_topN > 0:
            order = order[:pre_nms_topN]
        proposals = proposals[order, :]
        scores = scores[order]

        # 6. apply nms (e.g. threshold = 0.7)
        # 7. take after_nms_topN (e.g. 300)
        # 8. return the top proposals (-> RoIs top)
        keep = nms(np.hstack((proposals, scores)), nms_thresh)
        if post_nms_topN > 0:
            keep = keep[:post_nms_topN]
        proposals = proposals[keep, :]
        scores = scores[keep]

        # Output rois blob
        # Our RPN implementation only supports a single input image, so all
        # batch inds are 0
        batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
        blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))
        top[0].reshape(*(blob.shape))
        top[0].data[...] = blob

        # [Optional] output scores blob
        if len(top) > 1:
            top[1].reshape(*(scores.shape))
            top[1].data[...] = scores

    def backward(self, top, propagate_down, bottom):
        """This layer does not propagate gradients."""
        pass

    def reshape(self, bottom, top):
        """Reshaping happens during the call to forward."""
        pass


def _filter_boxes(boxes, min_size):
    """Remove all boxes with any side smaller than min_size.

    boxes holds one (x1, y1, x2, y2) row per box with inclusive coordinates
    (hence the "+ 1"). Returns the indices of the rows to keep.
    """
    ws = boxes[:, 2] - boxes[:, 0] + 1
    hs = boxes[:, 3] - boxes[:, 1] + 1
    keep = np.where((ws >= min_size) & (hs >= min_size))[0]
    return keep
models/pascal_voc/ZF/faster_rcnn_alt_opt/fast.pt
# Stage 1 network for training Fast R-CNN; its inputs are the RoIs extracted
# by the RPN together with the gt boxes.
name: "ZF"
layer {
  name: 'data'
  type: 'Python'
  top: 'data'
  top: 'rois'
  top: 'labels'
  top: 'bbox_targets'
  top: 'bbox_inside_weights'
  top: 'bbox_outside_weights'
  python_param {
    module: 'roi_data_layer.layer'  # corresponds to lib/roi_data_layer/layer.py
    # Feeds RoIs to the network when training Fast R-CNN; here the RoIs are
    # region proposals.
    layer: 'RoIDataLayer'
    param_str: "'num_classes': 21"
  }
}

# ZF net, used for feature extraction and shared.
#========= conv1-conv5 ============
layer {
  name: "conv1"
  type: "Convolution"
  bottom: "data"
  top: "conv1"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 96
    kernel_size: 7
    pad: 3
    stride: 2
  }
}
layer {
  name: "relu1"
  type: "ReLU"
  bottom: "conv1"
  top: "conv1"
}
layer {
  name: "norm1"
  type: "LRN"
  bottom: "conv1"
  top: "norm1"
  lrn_param {
    local_size: 3
    alpha: 0.00005
    beta: 0.75
    norm_region: WITHIN_CHANNEL
    engine: CAFFE
  }
}
layer {
  name: "pool1"
  type: "Pooling"
  bottom: "norm1"
  top: "pool1"
  pooling_param {
    kernel_size: 3
    stride: 2
    pad: 1
    pool: MAX
  }
}
layer {
  name: "conv2"
  type: "Convolution"
  bottom: "pool1"
  top: "conv2"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 256
    kernel_size: 5
    pad: 2
    stride: 2
  }
}
layer {
  name: "relu2"
  type: "ReLU"
  bottom: "conv2"
  top: "conv2"
}
layer {
  name: "norm2"
  type: "LRN"
  bottom: "conv2"
  top: "norm2"
  lrn_param {
    local_size: 3
    alpha: 0.00005
    beta: 0.75
    norm_region: WITHIN_CHANNEL
    engine: CAFFE
  }
}
layer {
  name: "pool2"
  type: "Pooling"
  bottom: "norm2"
  top: "pool2"
  pooling_param {
    kernel_size: 3
    stride: 2
    pad: 1
    pool: MAX
  }
}
layer {
  name: "conv3"
  type: "Convolution"
  bottom: "pool2"
  top: "conv3"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 384
    kernel_size: 3
    pad: 1
    stride: 1
  }
}
layer {
  name: "relu3"
  type: "ReLU"
  bottom: "conv3"
  top: "conv3"
}
layer {
  name: "conv4"
  type: "Convolution"
  bottom: "conv3"
  top: "conv4"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 384
    kernel_size: 3
    pad: 1
    stride: 1
  }
}
layer {
  name: "relu4"
  type: "ReLU"
  bottom: "conv4"
  top: "conv4"
}
layer {
  name: "conv5"
  type: "Convolution"
  bottom: "conv4"
  top: "conv5"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  convolution_param {
    num_output: 256
    kernel_size: 3
    pad: 1
    stride: 1
  }
}
layer {
  name: "relu5"
  type: "ReLU"
  bottom: "conv5"
  top: "conv5"
}

#========= RCNN ============
layer {
  name: "roi_pool_conv5"
  type: "ROIPooling"  # this layer is implemented in caffe-fast-rcnn
  bottom: "conv5"
  bottom: "rois"
  top: "roi_pool_conv5"
  roi_pooling_param {  # each RoI is max-pooled to a 6*6 output
    pooled_w: 6
    pooled_h: 6
    spatial_scale: 0.0625 # 1/16
  }
}
layer {
  name: "fc6"
  type: "InnerProduct"
  bottom: "roi_pool_conv5"
  top: "fc6"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  inner_product_param {
    num_output: 4096
  }
}
layer {
  name: "relu6"
  type: "ReLU"
  bottom: "fc6"
  top: "fc6"
}
layer {
  name: "drop6"
  type: "Dropout"
  bottom: "fc6"
  top: "fc6"
  dropout_param {
    dropout_ratio: 0.5
    scale_train: false
  }
}
layer {
  name: "fc7"
  type: "InnerProduct"
  bottom: "fc6"
  top: "fc7"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  inner_product_param {
    num_output: 4096
  }
}
layer {
  name: "relu7"
  type: "ReLU"
  bottom: "fc7"
  top: "fc7"
}
layer {
  name: "drop7"
  type: "Dropout"
  bottom: "fc7"
  top: "fc7"
  dropout_param {
    dropout_ratio: 0.5
    scale_train: false
  }
}
layer {
  name: "cls_score"
  type: "InnerProduct"
  bottom: "fc7"
  top: "cls_score"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  inner_product_param {
    num_output: 21
    weight_filler {
      type: "gaussian"
      std: 0.01
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "bbox_pred"
  type: "InnerProduct"
  bottom: "fc7"
  top: "bbox_pred"
  param { lr_mult: 1.0 }
  param { lr_mult: 2.0 }
  inner_product_param {
    num_output: 84
    weight_filler {
      type: "gaussian"
      std: 0.001
    }
    bias_filler {
      type: "constant"
      value: 0
    }
  }
}
layer {
  name: "loss_cls"
  type: "SoftmaxWithLoss"
  bottom: "cls_score"
  bottom: "labels"
  propagate_down: 1
  propagate_down: 0
  top: "cls_loss"
  loss_weight: 1
  loss_param {
    ignore_label: -1
    normalize: true
  }
}
layer {
  name: "loss_bbox"
  type: "SmoothL1Loss"
  bottom: "bbox_pred"
  bottom: "bbox_targets"
  bottom: "bbox_inside_weights"
  bottom: "bbox_outside_weights"
  top: "bbox_loss"
  loss_weight: 1
}

#========= RPN ============
# Dummy layers so that initial parameters are saved into the output net

layer {
  name: "rpn_conv1"
  type: "Convolution"
  bottom: "conv5"
  top: "rpn_conv1"
  param { lr_mult: 0 decay_mult: 0 }
  param { lr_mult: 0 decay_mult: 0 }
  convolution_param {
    num_output: 256
    kernel_size: 3
    pad: 1
    stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "rpn_relu1"
  type: "ReLU"
  bottom: "rpn_conv1"
  top: "rpn_conv1"
}
layer {
  name: "rpn_cls_score"
  type: "Convolution"
  bottom: "rpn_conv1"
  top: "rpn_cls_score"
  param { lr_mult: 0 decay_mult: 0 }
  param { lr_mult: 0 decay_mult: 0 }
  convolution_param {
    num_output: 18 # 2(bg/fg) * 9(anchors)
    kernel_size: 1
    pad: 0
    stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "rpn_bbox_pred"
  type: "Convolution"
  bottom: "rpn_conv1"
  top: "rpn_bbox_pred"
  param { lr_mult: 0 decay_mult: 0 }
  param { lr_mult: 0 decay_mult: 0 }
  convolution_param {
    num_output: 36 # 4 * 9(anchors)
    kernel_size: 1
    pad: 0
    stride: 1
    weight_filler { type: "gaussian" std: 0.01 }
    bias_filler { type: "constant" value: 0 }
  }
}
layer {
  name: "silence_rpn_cls_score"
  type: "Silence"
  bottom: "rpn_cls_score"
}
layer {
  name: "silence_rpn_bbox_pred"
  type: "Silence"
  bottom: "rpn_bbox_pred"
}
其中roi pooling layer在 caffe/src/layers/roi_pooling_layer.cpp里实现
// ------------------------------------------------------------------
// Fast R-CNN
// Copyright (c) 2015 Microsoft
// Licensed under The MIT License [see fast-rcnn/LICENSE for details]
// Written by Ross Girshick
// ------------------------------------------------------------------

#include <cfloat>

#include "caffe/fast_rcnn_layers.hpp"

using std::max;
using std::min;
using std::floor;
using std::ceil;

namespace caffe {

// Reads pooled_h, pooled_w and spatial_scale from the layer's
// roi_pooling_param; both pooled dimensions must be > 0.
template <typename Dtype>
void ROIPoolingLayer<Dtype>::LayerSetUp(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  ROIPoolingParameter roi_pool_param = this->layer_param_.roi_pooling_param();
  CHECK_GT(roi_pool_param.pooled_h(), 0)
      << "pooled_h must be > 0";
  CHECK_GT(roi_pool_param.pooled_w(), 0)
      << "pooled_w must be > 0";
  pooled_height_ = roi_pool_param.pooled_h();
  pooled_width_ = roi_pool_param.pooled_w();
  spatial_scale_ = roi_pool_param.spatial_scale();
  LOG(INFO) << "Spatial scale: " << spatial_scale_;
}

// Caches the feature-map dimensions of bottom[0] and sizes the output and the
// argmax buffer to (number of ROIs in bottom[1], channels, pooled_height_,
// pooled_width_).
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Reshape(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  channels_ = bottom[0]->channels();
  height_ = bottom[0]->height();
  width_ = bottom[0]->width();
  top[0]->Reshape(bottom[1]->num(), channels_, pooled_height_,
      pooled_width_);
  max_idx_.Reshape(bottom[1]->num(), channels_, pooled_height_,
      pooled_width_);
}

// Max-pools every ROI of bottom[1] over the feature map in bottom[0] into a
// pooled_height_ x pooled_width_ grid per channel, and records in max_idx_ the
// index (h * width_ + w) of the element that produced each maximum.
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Forward_cpu(const vector<Blob<Dtype>*>& bottom,
      const vector<Blob<Dtype>*>& top) {
  const Dtype* bottom_data = bottom[0]->cpu_data();
  const Dtype* bottom_rois = bottom[1]->cpu_data();
  // Number of ROIs
  int num_rois = bottom[1]->num();
  int batch_size = bottom[0]->num();
  int top_count = top[0]->count();
  Dtype* top_data = top[0]->mutable_cpu_data();
  // Start every output at -FLT_MAX so any real value replaces it, and every
  // argmax at -1 (no element selected yet).
  caffe_set(top_count, Dtype(-FLT_MAX), top_data);
  int* argmax_data = max_idx_.mutable_cpu_data();
  caffe_set(top_count, -1, argmax_data);

  // For each ROI R = [batch_index x1 y1 x2 y2]: max pool over R
  for (int n = 0; n < num_rois; ++n) {
    int roi_batch_ind = bottom_rois[0];
    // Scale the ROI corners by spatial_scale_ and round to integer
    // feature-map coordinates.
    int roi_start_w = round(bottom_rois[1] * spatial_scale_);
    int roi_start_h = round(bottom_rois[2] * spatial_scale_);
    int roi_end_w = round(bottom_rois[3] * spatial_scale_);
    int roi_end_h = round(bottom_rois[4] * spatial_scale_);
    CHECK_GE(roi_batch_ind, 0);
    CHECK_LT(roi_batch_ind, batch_size);

    // Force the ROI to be at least 1x1 so the bin sizes stay positive.
    int roi_height = max(roi_end_h - roi_start_h + 1, 1);
    int roi_width = max(roi_end_w - roi_start_w + 1, 1);
    const Dtype bin_size_h = static_cast<Dtype>(roi_height)
                             / static_cast<Dtype>(pooled_height_);
    const Dtype bin_size_w = static_cast<Dtype>(roi_width)
                             / static_cast<Dtype>(pooled_width_);

    const Dtype* batch_data = bottom_data + bottom[0]->offset(roi_batch_ind);

    for (int c = 0; c < channels_; ++c) {
      for (int ph = 0; ph < pooled_height_; ++ph) {
        for (int pw = 0; pw < pooled_width_; ++pw) {
          // Compute pooling region for this output unit:
          //  start (included) = floor(ph * roi_height / pooled_height_)
          //  end (excluded) = ceil((ph + 1) * roi_height / pooled_height_)
          int hstart = static_cast<int>(floor(static_cast<Dtype>(ph)
                                              * bin_size_h));
          int wstart = static_cast<int>(floor(static_cast<Dtype>(pw)
                                              * bin_size_w));
          int hend = static_cast<int>(ceil(static_cast<Dtype>(ph + 1)
                                           * bin_size_h));
          int wend = static_cast<int>(ceil(static_cast<Dtype>(pw + 1)
                                           * bin_size_w));

          // Shift by the ROI origin and clamp to the feature-map bounds.
          hstart = min(max(hstart + roi_start_h, 0), height_);
          hend = min(max(hend + roi_start_h, 0), height_);
          wstart = min(max(wstart + roi_start_w, 0), width_);
          wend = min(max(wend + roi_start_w, 0), width_);

          bool is_empty = (hend <= hstart) || (wend <= wstart);

          const int pool_index = ph * pooled_width_ + pw;
          // An empty bin outputs 0 with argmax -1 instead of -FLT_MAX.
          if (is_empty) {
            top_data[pool_index] = 0;
            argmax_data[pool_index] = -1;
          }

          for (int h = hstart; h < hend; ++h) {
            for (int w = wstart; w < wend; ++w) {
              const int index = h * width_ + w;
              if (batch_data[index] > top_data[pool_index]) {
                top_data[pool_index] = batch_data[index];
                argmax_data[pool_index] = index;
              }
            }
          }
        }
      }
      // Increment all data pointers by one channel
      batch_data += bottom[0]->offset(0, 1);
      top_data += top[0]->offset(0, 1);
      argmax_data += max_idx_.offset(0, 1);
    }
    // Increment ROI data pointer
    bottom_rois += bottom[1]->offset(1);
  }
}

// The CPU backward pass is not implemented.
template <typename Dtype>
void ROIPoolingLayer<Dtype>::Backward_cpu(const vector<Blob<Dtype>*>& top,
      const vector<bool>& propagate_down, const vector<Blob<Dtype>*>& bottom) {
  NOT_IMPLEMENTED;
}


#ifdef CPU_ONLY
STUB_GPU(ROIPoolingLayer);
#endif

INSTANTIATE_CLASS(ROIPoolingLayer);
REGISTER_LAYER_CLASS(ROIPooling);

}  // namespace caffe
