复现Retinaface-人脸检测模型

凌经赋

2023-12-01

Retinaface提出的动机

RetinaFace是InsightFace团队在2019年提出的人脸检测模型，原模型使用了deformable convolution（可变形卷积）和dense regression loss（密集回归损失），在WiderFace数据集上达到了SOTA。

截至2019年8月，原始模型尚未全部开源；目前开源的简化版是在传统物体检测网络RetinaNet基础上的改进版，添加了SSH网络的检测模块以提升检测精度。

作者提供了三种基础网络：基于ResNet的ResNet50和ResNet152版本精度更高，基于mobilenet（0.25）的轻量版本mnet检测速度更快。

MobileNet025网络结构

from tensorflow.keras.layers import DepthwiseConv2D, Conv2D, Activation, BatchNormalization, Input, Dense, Flatten
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
import tensorflow as tf

# Standard convolution block: Conv2D -> BatchNormalization -> ReLU6
def _conv_block(inputs, filters, kernel=(3, 3), strides=(1, 1)):
    """Apply a bias-free convolution followed by batch norm and ReLU6.

    Args:
        inputs: Input tensor.
        filters: Number of output channels of the convolution.
        kernel: Convolution kernel size.
        strides: Convolution strides.

    Returns:
        The activated output tensor.

    Note:
        The layer names ('conv01', 'conv01_bn', 'conv01_relu6') are
        hard-coded, so this block is presumably meant to be used only once
        per model (the backbone stem).
    """
    # The convolution has no bias because BatchNormalization follows it.
    x = Conv2D(filters, kernel, padding='same',
               strides=strides, use_bias=False, name='conv01')(inputs)
    x = BatchNormalization(name='conv01_bn')(x)
    return Activation(relu6, name='conv01_relu6')(x)

# ReLU6 activation
def relu6(x):
    """Return ReLU of ``x`` capped at a maximum value of 6."""
    return K.relu(x, max_value=6)

# Depthwise separable convolution block
def _depthwise_conv_block(inputs, pointwise_conv_filters,
                          depth_multiplier=1, strides=(1, 1), block_id=1):
    """Apply a 3x3 depthwise convolution followed by a 1x1 pointwise one.

    Each of the two convolutions is followed by BatchNormalization and ReLU6.

    Args:
        inputs: Input tensor.
        pointwise_conv_filters: Output channels of the 1x1 pointwise conv.
        depth_multiplier: Depth multiplier of the depthwise convolution.
        strides: Strides of the depthwise convolution.
        block_id: Integer used to build the layer names of this block.

    Returns:
        The output tensor of the block.
    """
    dw_name = 'conv_dw_%d' % block_id
    pw_name = 'conv_pw_%d' % block_id

    # Depthwise stage: one 3x3 filter per input channel.
    depthwise = DepthwiseConv2D((3, 3), padding='same',
                                depth_multiplier=depth_multiplier,
                                strides=strides, use_bias=False,
                                name=dw_name)(inputs)
    depthwise = BatchNormalization(name=dw_name + '_bn')(depthwise)
    depthwise = Activation(relu6, name=dw_name + '_relu')(depthwise)

    # Pointwise stage: 1x1 convolution that mixes channels.
    pointwise = Conv2D(pointwise_conv_filters, (1, 1), padding='same',
                       strides=(1, 1), name=pw_name)(depthwise)
    pointwise = BatchNormalization(name=pw_name + '_bn')(pointwise)
    return Activation(relu6, name=pw_name + '_relu6')(pointwise)

# MobileNet backbone
def MobileNet(img_tensor, depth_multiplier=1):
    """Build the MobileNet backbone and return three feature maps.

    The stem convolution and the stride-2 depthwise blocks (2, 4, 6 and 12)
    each halve the spatial size, so the returned maps are downsampled by
    8, 16 and 32 relative to the input (e.g. 80, 40 and 20 for a 640 input,
    with 64, 128 and 256 channels).

    Args:
        img_tensor: Input image tensor.
        depth_multiplier: Depth multiplier forwarded to every depthwise block.

    Returns:
        Tuple ``(feat1, feat2, feat3)`` taken after blocks 5, 11 and 13.
    """
    # Stem: stride-2 convolution producing 8 channels.
    x = _conv_block(inputs=img_tensor, filters=8, strides=(2, 2))

    unit, half = (1, 1), (2, 2)
    # (pointwise filters, strides) of depthwise blocks 1..13, in order.
    block_specs = [
        (16, unit),
        (32, half), (32, unit),
        (64, half), (64, unit),
        (128, half), (128, unit), (128, unit),
        (128, unit), (128, unit), (128, unit),
        (256, half), (256, unit),
    ]
    # Blocks whose output is exported as a feature map.
    tap_after = (5, 11, 13)

    taps = []
    for block_id, (filters, strides) in enumerate(block_specs, start=1):
        x = _depthwise_conv_block(x, filters,
                                  depth_multiplier=depth_multiplier,
                                  strides=strides, block_id=block_id)
        if block_id in tap_after:
            taps.append(x)

    feat1, feat2, feat3 = taps
    return feat1, feat2, feat3

Retinaface网络结构

from .mobilenet025 import MobileNet
import tensorflow.keras.backend as K
from tensorflow.keras.layers import Add, Concatenate, Conv2D, Activation, \
    BatchNormalization, LeakyReLU, Reshape, Input
from tensorflow.keras.models import Model
from utils.utils import compose
from .layers import UpsampleLike

# RetinaFace network
def Retinaface(cfg, backbone='mobilenet'):
    """Build the RetinaFace detection model.

    The backbone feature maps are reduced to ``cfg['out_channel']`` channels,
    fused top-down (P5 -> P4 -> P3), passed through one SSH module per level
    and finally fed to the box, class and landmark heads.

    Args:
        cfg: Configuration dict; only ``cfg['out_channel']`` is read here.
        backbone: Backbone name; only ``'mobilenet'`` is supported.

    Returns:
        A Keras ``Model`` with outputs ``[bbox_head, class_head, land_head]``,
        each concatenated over the three feature levels along axis 1.

    Raises:
        ValueError: If ``backbone`` is not ``'mobilenet'``.
    """
    inputs = Input(shape=(None, None, 3))

    if backbone == 'mobilenet':
        C3, C4, C5 = MobileNet(inputs)
    else:
        raise ValueError(f'Unsupported backbone{backbone},Please use mobilenet backbone')

    # LeakyReLU slope: 0.1 for narrow heads (<= 64 channels), otherwise 0.
    leaky = 0
    if cfg['out_channel'] <= 64:
        leaky = 0.1

    # 1x1 convolutions bring every feature level to the same channel count.
    # Keras Conv2D takes `strides` (plural), not `stride`.
    P3 = Conv2D_BN_Leaky(cfg['out_channel'], kernel_size=1, padding='same', strides=1, leaky=leaky, name='P3')(C3)
    P4 = Conv2D_BN_Leaky(cfg['out_channel'], kernel_size=1, padding='same', strides=1, leaky=leaky, name='P4')(C4)
    P5 = Conv2D_BN_Leaky(cfg['out_channel'], kernel_size=1, padding='same', strides=1, leaky=leaky, name='P5')(C5)

    # Upsample P5 to the size of P4 and fuse the two by addition.
    P5_Up = UpsampleLike()([P5, P4])
    P4_ADD = Add()([P5_Up, P4])
    # Smooth the fused P4 with a 3x3 convolution.
    P4_ADD_Conv2D = Conv2D_BN_Leaky(cfg['out_channel'], kernel_size=3, padding='same', strides=1, leaky=leaky,
                                    name='P4_ADD_Conv2D')(P4_ADD)

    # Upsample the fused P4 to the size of P3 and fuse the two by addition.
    # The layer has to be instantiated first and then called on the tensors.
    P4_Up = UpsampleLike()([P4_ADD_Conv2D, P3])
    P3_ADD = Add()([P4_Up, P3])
    # Smooth the fused P3 with a 3x3 convolution.
    P3_ADD_Conv2D = Conv2D_BN_Leaky(cfg['out_channel'], kernel_size=3, padding='same', strides=1, leaky=leaky,
                                    name='P3_ADD_Conv2D')(P3_ADD)

    # One SSH module per pyramid level.
    ssh_1 = SSH(P3_ADD_Conv2D, cfg['out_channel'], leaky=leaky)
    ssh_2 = SSH(P4_ADD_Conv2D, cfg['out_channel'], leaky=leaky)
    ssh_3 = SSH(P5, cfg['out_channel'], leaky=leaky)
    ssh_all = [ssh_1, ssh_2, ssh_3]

    # Stack the per-level predictions along the anchor axis.
    bbox_predict = Concatenate(name='bbox_head', axis=1)([BboxHead(bbox) for bbox in ssh_all])
    class_predict = Concatenate(name='class_head', axis=1)([ClassHead(cls) for cls in ssh_all])
    land_predict = Concatenate(name='land_head', axis=1)([LandMarkHead(land) for land in ssh_all])

    outputs = [bbox_predict, class_predict, land_predict]
    model = Model(inputs=inputs, outputs=outputs)

    return model

cfg结构

# Configuration used with the MobileNet-0.25 backbone.
cfg_mnet = dict(
    # Channel count shared by the feature-pyramid levels and heads.
    out_channel=64,
    name='mobilenet0.25',
    # Two anchor sizes per feature level. [16, 32] belongs to the largest
    # feature map (C3); the following pairs belong to C4 and C5. Smaller
    # maps have gone through more convolutions and carry more semantic
    # information, but small objects tend to vanish from them, so C5 suits
    # large faces and C3 suits small ones.
    min_sizes=[[16, 32], [64, 128], [256, 512]],
    # Downsampling factor of each feature level: 2^3, 2^4 and 2^5.
    steps=[8, 16, 32],
    # Whether anchor coordinates are clipped to the range [0, 1].
    clip=False,
    # Training resolution; 832 is used to improve detection of hard
    # samples in large images.
    train_image_size=832,
    # Loss weight (described as a regularisation weight in the original
    # note); presumably the weight of the box-regression loss - confirm
    # against the training code.
    loc_weights=2.0,
)

先验框代码实现

import numpy as np
from itertools import product


class Anchors(object):
    """Generate the prior (anchor) boxes for every feature level.

    Args:
        cfg: Dict providing 'min_sizes' (anchor sizes per feature level),
            'steps' (downsampling factor per level) and 'clip'.
        image_size: ``(height, width)`` of the input image. It is required
            in practice: the default ``None`` fails with ``TypeError`` when
            the feature-map sizes are computed.
    """

    def __init__(self, cfg, image_size=None):
        super(Anchors, self).__init__()
        # Anchor sizes (in pixels) for each feature level.
        self.min_sizes = cfg['min_sizes']
        # Downsampling factor of each feature level.
        self.steps = cfg['steps']
        # Whether to clip the anchors to the range [0, 1].
        self.clip = cfg['clip']
        # Image size as (height, width).
        self.image_size = image_size
        # Height and width of each feature map. np.ceil returns a float,
        # which range() rejects, so the values are converted to int.
        self.feature_maps = [[int(np.ceil(self.image_size[0] / step)),
                              int(np.ceil(self.image_size[1] / step))]
                             for step in self.steps]

    def get_anchors(self):
        """Return all anchors as corner boxes normalised by the image size.

        Returns:
            Array of shape ``(num_anchors, 4)`` with rows
            ``(x_min, y_min, x_max, y_max)``; values are clipped to
            ``[0, 1]`` when ``self.clip`` is true.
        """
        image_h, image_w = self.image_size[0], self.image_size[1]
        anchors = []
        for k, (rows, cols) in enumerate(self.feature_maps):
            step = self.steps[k]
            for i, j in product(range(rows), range(cols)):
                # Centre of cell (i, j); the +0.5 moves it to the cell middle.
                centre_x = (j + 0.5) * step / image_w
                centre_y = (i + 0.5) * step / image_h
                for min_size in self.min_sizes[k]:
                    # Normalised width and height of a square anchor.
                    s_w = min_size / image_w
                    s_h = min_size / image_h
                    anchors += [centre_x, centre_y, s_w, s_h]

        # Rows of (centre_x, centre_y, width, height).
        anchors = np.reshape(anchors, [-1, 4])
        output = np.zeros_like(anchors)
        # Convert centre/size form to top-left / bottom-right corners.
        output[:, 0] = anchors[:, 0] - anchors[:, 2] / 2
        output[:, 1] = anchors[:, 1] - anchors[:, 3] / 2
        output[:, 2] = anchors[:, 0] + anchors[:, 2] / 2
        output[:, 3] = anchors[:, 1] + anchors[:, 3] / 2

        if self.clip:
            output = np.clip(output, 0, 1)
        return output

Conv2D+BatchNormalization+LeakyReLU

# Conv2D + BatchNormalization + LeakyReLU
def Conv2D_BN_Leaky(*args, **kwargs):
    """Return a callable applying Conv2D, BatchNormalization and LeakyReLU.

    Args:
        *args: Positional arguments forwarded to ``Conv2D``.
        **kwargs: Keyword arguments forwarded to ``Conv2D``. The optional
            ``leaky`` key (default 0.1) is consumed here and used as the
            LeakyReLU slope instead of being forwarded.

    Returns:
        The composed callable; call it on a tensor to apply the three layers.
    """
    # pop() removes 'leaky' so Conv2D never sees it, and supplies the default
    # when the caller did not pass it - no exception handling needed.
    leaky = kwargs.pop('leaky', 0.1)
    return compose(Conv2D(*args, **kwargs),
                   BatchNormalization(),
                   LeakyReLU(alpha=leaky))

Conv2D+BatchNormalization

# Conv2D + BatchNormalization
def Conv2D_BN(*args, **kwargs):
    """Return a callable applying Conv2D followed by BatchNormalization.

    All positional and keyword arguments are forwarded to ``Conv2D``.
    """
    conv = Conv2D(*args, **kwargs)
    norm = BatchNormalization()
    return compose(conv, norm)

SSH网络结构

# SSH module: enlarges the receptive field at several scales
def SSH(input_tensor, out_channel, leaky=0.1):
    """Apply three parallel branches built from 3x3 convolutions.

    The branches stack one, two and three 3x3 convolutions (standing in for
    3x3, 5x5 and 7x7 kernels with fewer parameters) and contribute
    ``out_channel // 2``, ``out_channel // 4`` and ``out_channel // 4``
    channels respectively.

    Args:
        input_tensor: Input feature map.
        out_channel: Channel budget that the branch widths are derived from.
        leaky: LeakyReLU slope used inside the 5x5 and 7x7 branches.

    Returns:
        The concatenated branches after a ReLU activation.
    """
    # 3x3 branch. Keras Conv2D takes `strides` (plural), not `stride`.
    Conv_3 = Conv2D_BN(out_channel // 2, kernel_size=3, strides=1, padding='same')(input_tensor)

    # 5x5 branch: two stacked 3x3 convolutions.
    Conv_5_2 = Conv2D_BN_Leaky(out_channel // 4, kernel_size=3, strides=1, padding='same', leaky=leaky)(input_tensor)
    Conv_5 = Conv2D_BN(out_channel // 4, kernel_size=3, strides=1, padding='same')(Conv_5_2)

    # 7x7 branch: three stacked 3x3 convolutions. It reuses the first
    # convolution of the 5x5 branch (Conv_5_2); starting from Conv_5 instead
    # would stack four convolutions.
    Conv_7_3 = Conv2D_BN_Leaky(out_channel // 4, kernel_size=3, strides=1, padding='same', leaky=leaky)(Conv_5_2)
    Conv_7 = Conv2D_BN(out_channel // 4, kernel_size=3, strides=1, padding='same')(Conv_7_3)

    # Stack all branches together.
    output = Concatenate()([Conv_3, Conv_5, Conv_7])

    return Activation('relu')(output)

ClassHead检测人脸

# Classification head: predicts whether each anchor contains a face.
def ClassHead(inputs, num_anchors=2):
    """Return per-anchor two-way softmax scores.

    A 1x1 convolution produces ``num_anchors * 2`` channels, which are
    reshaped to ``(-1, 2)`` and passed through a softmax.

    Args:
        inputs: Feature map of one pyramid level.
        num_anchors: Number of anchors per feature-map cell.
    """
    # 1x1 convolution adjusts the channel count to two scores per anchor.
    scores = Conv2D(num_anchors * 2, kernel_size=1, strides=1)(inputs)
    softmax = Activation('softmax')
    per_anchor = Reshape([-1, 2])(scores)
    return softmax(per_anchor)

BboxHead调整先验框获得预测框

# Box regression head: its output adjusts the anchors into predicted boxes.
def BboxHead(inputs, num_anchors=2):
    """Return four box-regression values per anchor.

    A 1x1 convolution produces ``num_anchors * 4`` channels, which are
    reshaped to ``(-1, 4)``.

    Args:
        inputs: Feature map of one pyramid level.
        num_anchors: Number of anchors per feature-map cell.
    """
    regression = Conv2D(num_anchors * 4, kernel_size=1, strides=1)(inputs)
    flatten_to_boxes = Reshape([-1, 4])
    return flatten_to_boxes(regression)

LandMarkHead调整人脸关键点

# Landmark regression head: its output adjusts the anchors into facial
# landmarks. There are five landmarks and each needs two values.
def LandMarkHead(inputs, num_anchors=2):
    """Return ten landmark-regression values per anchor.

    A 1x1 convolution produces ``num_anchors * 10`` channels, which are
    reshaped to ``(-1, 10)``.

    Args:
        inputs: Feature map of one pyramid level.
        num_anchors: Number of anchors per feature-map cell.
    """
    regression = Conv2D(num_anchors * 10, kernel_size=1, strides=1)(inputs)
    flatten_to_landmarks = Reshape([-1, 10])
    return flatten_to_landmarks(regression)

上采样

from tensorflow.keras import backend as K
from tensorflow.keras.layers import Layer
import tensorflow as tf


class UpsampleLike(Layer):
    """Resize a source tensor to the spatial size of a target tensor.

    Nearest-neighbour interpolation is used. The layer is called on a list
    ``[source, target]``.
    """

    def call(self, inputs, **kwargs):
        """Return ``source`` resized to the height and width of ``target``.

        Args:
            inputs: A pair ``(source, target)`` of 4-D tensors.
            **kwargs: Ignored; accepted for Keras compatibility.
        """
        source, target = inputs
        target_shape = K.shape(target)
        nearest = tf.image.ResizeMethod.NEAREST_NEIGHBOR

        if K.image_data_format() == 'channels_first':
            # resize_images expects the channel axis last, so move it there,
            # resize, and move it back. Without this branch the layer
            # silently returned None for channels_first.
            source = tf.transpose(source, (0, 2, 3, 1))
            resized = tf.compat.v1.image.resize_images(
                source, (target_shape[2], target_shape[3]), method=nearest)
            return tf.transpose(resized, (0, 3, 1, 2))

        return tf.compat.v1.image.resize_images(
            source, (target_shape[1], target_shape[2]), method=nearest)

utils代码

import cv2
from functools import reduce
import numpy as np
import tensorflow as tf
from tensorflow.keras import backend as K
import warnings


def compose(*funcs):
    """Chain callables left to right into a single callable.

    ``compose(f, g, h)(*a, **kw)`` evaluates ``h(g(f(*a, **kw)))``: the first
    callable receives the original arguments and every later one receives
    the previous result as its only argument. With a single callable, that
    callable is returned unchanged.

    Raises:
        ValueError: If no callable is given.
    """
    if not funcs:
        raise ValueError('Composition of empty sequence not supported')

    def chain(inner, outer):
        # Build one link of the pipeline: run `inner`, feed it to `outer`.
        def piped(*args, **kwargs):
            return outer(inner(*args, **kwargs))

        return piped

    # reduce folds the links together pairwise, from left to right.
    return reduce(chain, funcs)


class BBoxUtility(object):
    """Helper holding the prior boxes and matching thresholds.

    Args:
        priors: Array of prior boxes, or ``None``. Assumed to have rows of
            ``(x_min, y_min, x_max, y_max)`` - confirm against the caller.
        overlap_threshold: Overlap threshold stored on the instance.
        nms_thresh: Non-maximum-suppression threshold stored on the instance.
    """

    def __init__(self, priors=None, overlap_threshold=0.35, nms_thresh=0.45):
        self.priors = priors
        self.num_priors = 0 if priors is None else len(priors)
        self.overlap_threshold = overlap_threshold
        self._nms_thresh = nms_thresh

    def iou(self, box):
        """Return the IoU between ``box`` and every prior box.

        Args:
            box: Sequence whose first four values are
                ``(x_min, y_min, x_max, y_max)``.

        Returns:
            1-D array with one intersection-over-union value per prior.
        """
        # Corners of the intersection rectangle.
        inter_upleft = np.maximum(self.priors[:, :2], box[:2])
        inter_botright = np.minimum(self.priors[:, 2:4], box[2:4])
        # Negative extents mean the boxes do not overlap.
        inter_wh = np.maximum(inter_botright - inter_upleft, 0)
        inter = inter_wh[:, 0] * inter_wh[:, 1]

        area_box = (box[2] - box[0]) * (box[3] - box[1])
        area_priors = ((self.priors[:, 2] - self.priors[:, 0])
                       * (self.priors[:, 3] - self.priors[:, 1]))
        union = area_box + area_priors - inter
        return inter / union


class ModelCheckpoint(tf.keras.callbacks.Callback):
    """Callback that saves the model (or its weights) at the end of epochs.

    Args:
        filepath: Target path; formatted with ``epoch`` and the keys of the
            epoch's ``logs`` (e.g. ``'ep{epoch:03d}--loss{loss:.3f}.h5'``).
        monitor: Name of the logged quantity used when ``save_best_only``.
        verbose: Print progress messages when greater than 0.
        save_best_only: Save only when the monitored quantity improves.
        save_weights_only: Save only the weights instead of the full model.
        mode: 'auto', 'min' or 'max'; decides whether a lower or higher
            monitored value counts as an improvement. Unknown values fall
            back to 'auto' with a warning.
        period: Number of epochs between two save attempts.
    """

    def __init__(self, filepath, monitor='val_loss', verbose=0,
                 save_best_only=False, save_weights_only=False,
                 mode='auto', period=1):
        super(ModelCheckpoint, self).__init__()
        self.monitor = monitor
        self.verbose = verbose
        self.filepath = filepath
        self.save_best_only = save_best_only
        self.save_weights_only = save_weights_only
        self.period = period
        self.epochs_since_last_save = 0

        if mode not in ['auto', 'min', 'max']:
            warnings.warn('ModelCheckpoint mode %s is unknown, '
                          'fallback to auto mode.' % (mode),
                          RuntimeWarning)
            mode = 'auto'

        # In 'auto' mode accuracy-like metrics are maximised and everything
        # else is minimised.
        if mode == 'max':
            maximise = True
        elif mode == 'min':
            maximise = False
        else:
            maximise = ('acc' in self.monitor
                        or self.monitor.startswith('fmeasure'))

        # np.inf is used because the np.Inf alias was removed in NumPy 2.0.
        if maximise:
            self.monitor_op = np.greater
            self.best = -np.inf
        else:
            self.monitor_op = np.less
            self.best = np.inf

    def _save(self, filepath):
        """Write the weights or the full model to ``filepath``."""
        if self.save_weights_only:
            self.model.save_weights(filepath, overwrite=True)
        else:
            self.model.save(filepath, overwrite=True)

    def on_epoch_end(self, epoch, logs=None):
        """Save a checkpoint if ``period`` epochs passed since the last one."""
        logs = logs or {}
        self.epochs_since_last_save += 1
        if self.epochs_since_last_save < self.period:
            return

        self.epochs_since_last_save = 0
        filepath = self.filepath.format(epoch=epoch + 1, **logs)

        if not self.save_best_only:
            if self.verbose > 0:
                print('\nEpoch %05d: saving model to %s' % (epoch + 1, filepath))
            self._save(filepath)
            return

        current = logs.get(self.monitor)
        if current is None:
            warnings.warn('Can save best model only with %s available, '
                          'skipping.' % self.monitor, RuntimeWarning)
            return

        if self.monitor_op(current, self.best):
            if self.verbose > 0:
                print('\nEpoch %05d: %s improved from %0.5f to %0.5f,'
                      ' saving model to %s'
                      % (epoch + 1, self.monitor, self.best,
                         current, filepath))
            self.best = current
            self._save(filepath)
        elif self.verbose > 0:
            print('\nEpoch %05d: %s did not improve' %
                  (epoch + 1, self.monitor))

生成先验框

import numpy as np
from itertools import product


class Anchors(object):
    """Generate the prior (anchor) boxes for every feature level.

    Args:
        cfg: Dict providing 'min_sizes' (anchor sizes per feature level),
            'steps' (downsampling factor per level) and 'clip'.
        image_size: ``(height, width)`` of the input image. It is required
            in practice: the default ``None`` fails with ``TypeError`` when
            the feature-map sizes are computed.
    """

    def __init__(self, cfg, image_size=None):
        super(Anchors, self).__init__()
        # Anchor sizes (in pixels) for each feature level.
        self.min_sizes = cfg['min_sizes']
        # Downsampling factor of each feature level.
        self.steps = cfg['steps']
        # Whether to clip the anchors to the range [0, 1].
        self.clip = cfg['clip']
        # Image size as (height, width).
        self.image_size = image_size
        # Height and width of each feature map. np.ceil returns a float,
        # which range() rejects, so the values are converted to int.
        self.feature_maps = [[int(np.ceil(self.image_size[0] / step)),
                              int(np.ceil(self.image_size[1] / step))]
                             for step in self.steps]

    def get_anchors(self):
        """Return all anchors as corner boxes normalised by the image size.

        Returns:
            Array of shape ``(num_anchors, 4)`` with rows
            ``(x_min, y_min, x_max, y_max)``; values are clipped to
            ``[0, 1]`` when ``self.clip`` is true.
        """
        image_h, image_w = self.image_size[0], self.image_size[1]
        anchors = []
        for k, (rows, cols) in enumerate(self.feature_maps):
            step = self.steps[k]
            for i, j in product(range(rows), range(cols)):
                # Centre of cell (i, j); the +0.5 moves it to the cell middle.
                centre_x = (j + 0.5) * step / image_w
                centre_y = (i + 0.5) * step / image_h
                for min_size in self.min_sizes[k]:
                    # Normalised width and height of a square anchor.
                    s_w = min_size / image_w
                    s_h = min_size / image_h
                    anchors += [centre_x, centre_y, s_w, s_h]

        # Rows of (centre_x, centre_y, width, height).
        anchors = np.reshape(anchors, [-1, 4])
        output = np.zeros_like(anchors)
        # Convert centre/size form to top-left / bottom-right corners.
        output[:, 0] = anchors[:, 0] - anchors[:, 2] / 2
        output[:, 1] = anchors[:, 1] - anchors[:, 3] / 2
        output[:, 2] = anchors[:, 0] + anchors[:, 2] / 2
        output[:, 3] = anchors[:, 1] + anchors[:, 3] / 2

        if self.clip:
            output = np.clip(output, 0, 1)
        return output

训练代码

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard

from nets.retinaface import Retinaface
from utils.anchors import Anchors
from utils.config import cfg_mnet
from utils.utils import BBoxUtility, ModelCheckpoint

if __name__ == '__main__':
    # NOTE(review): ExponentDecayScheduler, Generator, box_smooth_l1,
    # conf_loss and ldm_smooth_l1 are used below but are not imported anywhere
    # in the visible file; import them from the project's training module
    # before running this script.

    # Path to the training annotations (face labels and coordinates).
    train_dataset_path = ''
    # Backbone selection; only 'mobilenet' is handled below.
    backbone = 'mobilenet'

    if backbone == 'mobilenet':
        cfg = cfg_mnet
        # Number of leading layers frozen during the first training phase.
        freeze_layers = 81

    else:
        raise ValueError(f'Unsupported backbone -{backbone},User mobilenet,resnet50')

    # Side length of the (square) training images.
    img_dim = cfg['train_image_size']

    model = Retinaface(cfg, backbone)
    # Path of the pretrained weights to load.
    model_path = 'model_data/'
    # by_name=True loads weights into layers with matching names, which
    # allows loading into a different architecture; by_name=False loads by
    # topology and needs the same architecture as when the weights were saved.
    # skip_mismatch only takes effect with by_name=True and skips layers whose
    # weight count or shape does not match.
    model.load_weights(model_path, by_name=True, skip_mismatch=True)
    # Build the anchors and the box utility.
    anchors = Anchors(cfg=cfg, image_size=(img_dim, img_dim)).get_anchors()
    bbox_util = BBoxUtility(anchors)

    # Training callbacks.
    # TensorBoard writes its logs to 'logs' so metrics can be watched live.
    logging = TensorBoard(log_dir='logs')
    # The checkpoint callback saves to `filepath` every `period` epochs.
    # save_weights_only=True stores only the weights; save_best_only=True
    # would keep only the best model according to `monitor` and `mode`
    # ('min', 'max', or 'auto' to infer the direction from the monitor name).
    checkpoint = ModelCheckpoint('logs/ep{epoch:03d}--loss{loss:.3f}.h5',
                                 monitor='loss',
                                 save_weights_only=True,
                                 save_best_only=False,
                                 period=1,
                                 mode='auto')
    # Learning-rate schedule; with decay_rate=0.92 the rate presumably drops
    # by 8% every epoch - confirm against ExponentDecayScheduler.
    reduce_lr = ExponentDecayScheduler(decay_rate=0.92, verbose=1)
    # Stop early when the monitored loss has not improved for `patience`
    # epochs, which suggests the model has converged.
    early_stopping = EarlyStopping(monitor='loss', min_delta=0, patience=6, verbose=1)

    # The loss-dict keys must equal the names of the model's output layers
    # ('bbox_head', 'class_head', 'land_head').
    def build_losses():
        return {
            'bbox_head': box_smooth_l1(weights=cfg['loc_weights']),
            'class_head': conf_loss(),
            'land_head': ldm_smooth_l1()
        }

    # Phase 1: freeze the first `freeze_layers` layers. Backbone features are
    # generic, so freezing speeds up training and protects the pretrained
    # weights early on.
    for i in range(freeze_layers): model.layers[i].trainable = False
    print('Freeze the first {} layers of total {} layers.'.format(freeze_layers, len(model.layers)))
    if True:
        batch_size = 8
        Init_epoch = 0
        Freeze_epoch = 50
        learning_rate_base = 1e-3
        gen = Generator(train_dataset_path, img_dim, batch_size, bbox_util)
        model.compile(loss=build_losses(),
                      optimizer=keras.optimizers.Adam(learning_rate=learning_rate_base))
        model.fit(gen,
                  # Steps per epoch; each step trains on one batch.
                  steps_per_epoch=gen.get_len() // batch_size,
                  verbose=1,  # 1 shows a progress bar
                  epochs=Freeze_epoch,  # epoch index at which training stops
                  initial_epoch=Init_epoch,  # epochs run = epochs - initial_epoch
                  callbacks=[logging, checkpoint, reduce_lr, early_stopping])

    # Phase 2: unfreeze the layers and fine-tune the whole network.
    for i in range(freeze_layers): model.layers[i].trainable = True

    if True:
        batch_size = 4
        Freeze_epoch = 50
        Epoch = 100
        learning_rate_base = 1e-4
        gen = Generator(train_dataset_path, img_dim, batch_size, bbox_util)

        model.compile(loss=build_losses(),
                      optimizer=keras.optimizers.Adam(learning_rate=learning_rate_base))
        model.fit(gen,
                  steps_per_epoch=gen.get_len() // batch_size,
                  verbose=1,
                  epochs=Epoch,
                  # Resume the epoch counter where the frozen phase ended.
                  initial_epoch=Freeze_epoch,
                  callbacks=[logging, checkpoint, reduce_lr, early_stopping])

更详细的介绍请查阅参考链接下的博文

参考链接

睿智的目标检测43——TF2搭建Retinaface人脸检测与关键点定位平台（tensorflow2）

未完待续，随笔，学完之后整理记录