Faster R-CNN源码阅读之五：Faster R-CNN/lib/rpn_msr/proposal_layer_tf.py

马宜民
2023-12-01
一、介绍
本demo由Faster R-CNN官方提供，我只是在官方的代码上增加了注释，一方面方便我自己学习，另一方面贴出来和大家一起交流。
该文件中的函数的主要目的是通过将估计的边界框变换应用于一组常规框（称为“anchors”）来输出目标检测proposals。
二、代码以及注释
# -*- coding:utf-8 -*-
# --------------------------------------------------------
# Faster R-CNN
# Copyright (c) 2015 Microsoft
# Licensed under The MIT License [see LICENSE for details]
# Written by Ross Girshick and Sean Bell
# --------------------------------------------------------

import numpy as np
import yaml
from fast_rcnn.config import cfg
from generate_anchors import generate_anchors
from fast_rcnn.bbox_transform import bbox_transform_inv, clip_boxes
from fast_rcnn.nms_wrapper import nms
import pdb

# When True, proposal_layer prints the image size, the image scale and the
# shape of the score map while it runs.
DEBUG = False
# NOTE: this string comes after the imports, so it is a plain expression
# statement describing the module rather than the module docstring.
"""
Outputs object detection proposals by applying estimated bounding-box
transformations to a set of regular boxes (called "anchors").
"""


def proposal_layer(rpn_cls_prob_reshape, rpn_bbox_pred, im_info, cfg_key, _feat_stride=(16,),
                   anchor_scales=(8, 16, 32)):
    """Turn RPN outputs into a ranked, NMS-filtered set of region proposals.

    Algorithm:
      1. For every (H, W) cell of the score map, place A anchors on the cell
         and apply the predicted bbox deltas of that cell to them.
      2. Clip the resulting boxes to the image.
      3. Drop boxes whose width or height is below a threshold.
      4. Sort the remaining (proposal, score) pairs by score, descending.
      5. Keep the top ``RPN_PRE_NMS_TOP_N`` pairs.
      6. Apply NMS with threshold ``RPN_NMS_THRESH``.
      7. Keep the top ``RPN_POST_NMS_TOP_N`` survivors.
      8. Return them as RoIs.

    Args:
        rpn_cls_prob_reshape: RPN class probabilities. The array is transposed
            with ``[0, 3, 1, 2]``, i.e. it is treated as channels-last
            (N, H, W, C) on entry. Along the channel axis the first A entries
            are background probabilities and the rest are foreground
            probabilities. The batch dimension must be 1.
        rpn_bbox_pred: RPN box regression output, transposed the same way
            (presumably (1, H, W, 4 * A) -- confirm against the caller).
        im_info: per-image info; only row 0 is used. Its first two entries
            are passed to ``clip_boxes`` as the image size and its third
            entry is the image scale applied to the minimum box size.
        cfg_key: key into ``cfg`` selecting the RPN settings (the original
            code mentions 'TRAIN' or 'TEST').
        _feat_stride: stride, in input-image pixels, between neighbouring
            score-map cells. A length-1 sequence or a scalar.
        anchor_scales: scales forwarded to ``generate_anchors``.

    Returns:
        A float32 array with one row per kept proposal: a leading batch-index
        column that is always 0, followed by the proposal coordinates.

    Raises:
        AssertionError: if the batch dimension is not 1.
    """
    # Base anchors, not yet shifted to any score-map cell; one row of four
    # coordinates per anchor.
    _anchors = generate_anchors(scales=np.array(anchor_scales))
    _num_anchors = _anchors.shape[0]

    # Reorder both inputs from (N, H, W, C) to (N, C, H, W).
    rpn_cls_prob_reshape = np.transpose(rpn_cls_prob_reshape, [0, 3, 1, 2])
    rpn_bbox_pred = np.transpose(rpn_bbox_pred, [0, 3, 1, 2])

    # Only the first image's info is used.
    im_info = im_info[0]

    # Exactly one image per call is supported.
    assert rpn_cls_prob_reshape.shape[0] == 1, \
        'Only single item batches are supported'

    # RPN settings for the requested phase.
    pre_nms_topN = cfg[cfg_key].RPN_PRE_NMS_TOP_N
    post_nms_topN = cfg[cfg_key].RPN_POST_NMS_TOP_N
    nms_thresh = cfg[cfg_key].RPN_NMS_THRESH
    min_size = cfg[cfg_key].RPN_MIN_SIZE

    # The first set of _num_anchors channels are bg probs, the second set are
    # the fg probs, which are the ones we want.
    scores = rpn_cls_prob_reshape[:, _num_anchors:, :, :]

    # Per-anchor box transformations predicted by the RPN.
    bbox_deltas = rpn_bbox_pred

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    if DEBUG:
        print('im_size: ({}, {})'.format(im_info[0], im_info[1]))
        print('scale: {}'.format(im_info[2]))

    # 1. Generate proposals from bbox deltas and shifted anchors.
    # After the transpose above the last two axes are height and width.
    height, width = scores.shape[-2:]

    if DEBUG:
        print('score map size: {}'.format(scores.shape))

    # Enumerate all shifts: one (x, y, x, y) row per score-map cell. x and y
    # are each repeated because an anchor stores two corner points and both
    # corners move by the same offset.
    shift_x = np.arange(0, width) * _feat_stride
    shift_y = np.arange(0, height) * _feat_stride
    shift_x, shift_y = np.meshgrid(shift_x, shift_y)
    shifts = np.vstack((shift_x.ravel(), shift_y.ravel(),
                        shift_x.ravel(), shift_y.ravel())).transpose()

    # Enumerate all shifted anchors:
    #
    # add A anchors (1, A, 4) to
    # cell K shifts (K, 1, 4) to get
    # shift anchors (K, A, 4)
    # reshape to (K*A, 4) shifted anchors
    A = _num_anchors
    K = shifts.shape[0]
    anchors = _anchors.reshape((1, A, 4)) + \
              shifts.reshape((1, K, 4)).transpose((1, 0, 2))
    anchors = anchors.reshape((K * A, 4))

    # Transpose and reshape predicted bbox transformations to get them
    # into the same order as the anchors:
    #
    # bbox deltas will be (1, 4 * A, H, W) format
    # transpose to (1, H, W, 4 * A)
    # reshape to (1 * H * W * A, 4) where rows are ordered by (h, w, a)
    # in slowest to fastest order
    bbox_deltas = bbox_deltas.transpose((0, 2, 3, 1)).reshape((-1, 4))

    # Same story for the scores:
    #
    # scores are (1, A, H, W) format
    # transpose to (1, H, W, A)
    # reshape to (1 * H * W * A, 1) where rows are ordered by (h, w, a)
    scores = scores.transpose((0, 2, 3, 1)).reshape((-1, 1))

    # Convert anchors into proposals via bbox transformations.
    proposals = bbox_transform_inv(anchors, bbox_deltas)

    # 2. Clip predicted boxes to the image.
    proposals = clip_boxes(proposals, im_info[:2])

    # 3. Remove predicted boxes with either height or width < threshold.
    # (NOTE: convert min_size to input image scale stored in im_info[2])
    keep = _filter_boxes(proposals, min_size * im_info[2])
    proposals = proposals[keep, :]
    scores = scores[keep]

    # 4. Sort all (proposal, score) pairs by score from highest to lowest.
    # 5. Take the top pre_nms_topN (e.g. 6000); a non-positive value keeps all.
    order = scores.ravel().argsort()[::-1]
    if pre_nms_topN > 0:
        order = order[:pre_nms_topN]
    proposals = proposals[order, :]
    scores = scores[order]

    # 6. Apply NMS (e.g. threshold = 0.7).
    # 7. Take the top post_nms_topN (e.g. 300); a non-positive value keeps all.
    # 8. Return the top proposals (-> RoIs top).
    keep = nms(np.hstack((proposals, scores)), nms_thresh)
    if post_nms_topN > 0:
        keep = keep[:post_nms_topN]
    proposals = proposals[keep, :]
    scores = scores[keep]

    # Output rois blob. This implementation only supports a single input
    # image, so all batch indices are 0.
    batch_inds = np.zeros((proposals.shape[0], 1), dtype=np.float32)
    blob = np.hstack((batch_inds, proposals.astype(np.float32, copy=False)))

    # Only the RoIs are returned; the filtered scores are computed above but
    # not handed back to the caller.
    return blob


def _filter_boxes(boxes, min_size):
    """Return the row indices of boxes that are at least min_size on both sides.

    Columns 0 and 2 of ``boxes`` are subtracted to get the width and columns
    1 and 3 to get the height; one is added to each difference, so a box
    whose two corners coincide has width and height 1.

    Args:
        boxes: 2-D array with at least four columns, one box per row.
        min_size: smallest width and height a box may have and still be kept.

    Returns:
        1-D integer array of the indices of the rows that pass the test.
    """
    left, top = boxes[:, 0], boxes[:, 1]
    right, bottom = boxes[:, 2], boxes[:, 3]

    # Inclusive extent along each axis.
    widths = right - left + 1
    heights = bottom - top + 1

    # A box survives only when both of its sides reach the minimum.
    large_enough = np.logical_and(widths >= min_size, heights >= min_size)
    return np.nonzero(large_enough)[0]
Faster R-CNN源码阅读之五：Faster R-CNN/lib/rpn_msr/proposal_layer_tf.py

相关阅读

相关文章

相关问答

相关文档