unimrcp本身提供了一个简单的VAD算法模块,这里并不讨论这个算法的好坏,只记录一下它的用法流程。
算法实现代码:libs/mpf/src/mpf_activity_detector.c
使用时,调用mpf_activity_detector_create()创建,以demo_recog_engine为例:
/* Create the demo recognizer channel and put its fields into their initial state. */
demo_recog_channel_t *recog_channel = apr_palloc(pool,sizeof(demo_recog_channel_t));
recog_channel->demo_engine = engine->obj;
/* No request is in progress yet. */
recog_channel->recog_request = NULL;
recog_channel->stop_response = NULL;
/* The channel owns its own activity detector, created from the same pool. */
recog_channel->detector = mpf_activity_detector_create(pool);
recog_channel->audio_out = NULL;
mpf_activity_detector_create的实现代码:
/**
 * Create an activity detector.
 *
 * The detector is allocated from @a pool and starts in the inactivity state
 * with an elapsed duration of zero. Default tuning values are applied here;
 * they can be overridden later through the corresponding *_set() functions.
 *
 * @param pool the pool to allocate the detector from
 * @return the newly created detector
 */
MPF_DECLARE(mpf_activity_detector_t*) mpf_activity_detector_create(apr_pool_t *pool)
{
	mpf_activity_detector_t *vad = apr_palloc(pool,sizeof(*vad));

	/* initial state machine position */
	vad->state = DETECTOR_STATE_INACTIVITY;
	vad->duration = 0;

	/* default tuning parameters */
	vad->noinput_timeout = 5000; /* 5 s */
	vad->silence_timeout = 300;  /* 0.3 s */
	vad->speech_timeout = 300;   /* 0.3 s */
	vad->level_threshold = 2;    /* 0 .. 255 */

	return vad;
}
就是初始化detector的参数,包括能量阈值、状态转换的时间阈值。
这些值,都提供了设置的接口:
/**
 * Create activity detector.
 * @param pool the pool to allocate the detector from
 * @return the created detector, initialized with default thresholds/timeouts
 */
MPF_DECLARE(mpf_activity_detector_t*) mpf_activity_detector_create(apr_pool_t *pool);
/**
 * Reset activity detector.
 * @param detector the detector to reset
 */
MPF_DECLARE(void) mpf_activity_detector_reset(mpf_activity_detector_t *detector);
/**
 * Set threshold of voice activity (silence) level.
 * A frame whose computed level is greater than or equal to this value is
 * treated as activity; anything below it is treated as silence.
 * @param detector the detector to configure
 * @param level_threshold the threshold (the default is commented as 0 .. 255)
 */
MPF_DECLARE(void) mpf_activity_detector_level_set(mpf_activity_detector_t *detector, apr_size_t level_threshold);
/**
 * Set noinput timeout.
 * @param detector the detector to configure
 * @param noinput_timeout inactivity duration after which a noinput event is
 *        reported (the default of 5000 corresponds to 5 s, i.e. milliseconds)
 */
MPF_DECLARE(void) mpf_activity_detector_noinput_timeout_set(mpf_activity_detector_t *detector, apr_size_t noinput_timeout);
/**
 * Set timeout required to trigger speech (transition from inactive to active state).
 * @param detector the detector to configure
 * @param speech_timeout duration of continuous activity required before the
 *        activity event is raised (the default of 300 corresponds to 0.3 s)
 */
MPF_DECLARE(void) mpf_activity_detector_speech_timeout_set(mpf_activity_detector_t *detector, apr_size_t speech_timeout);
/**
 * Set timeout required to trigger silence (transition from active to inactive state).
 * @param detector the detector to configure
 * @param silence_timeout duration of continuous silence required before the
 *        inactivity event is raised (the default of 300 corresponds to 0.3 s)
 */
MPF_DECLARE(void) mpf_activity_detector_silence_timeout_set(mpf_activity_detector_t *detector, apr_size_t silence_timeout);
如果使用1.7里缺省的level_threshold 2,我的实测结果是收不到打断事件。
收到媒体包时,调用mpf_activity_detector_process处理,里面维护了一个状态机:
/* Run the activity detector only while a recognition request is in progress. */
if(recog_channel->recog_request) {
/* Feed the current frame to the detector and act on the event it reports. */
mpf_detector_event_e det_event = mpf_activity_detector_process(recog_channel->detector,frame);
switch(det_event) {
/* Speech started: log it and notify via demo_recog_start_of_input(). */
case MPF_DETECTOR_EVENT_ACTIVITY:
apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"Detected Voice Activity " APT_SIDRES_FMT,
MRCP_MESSAGE_SIDRES(recog_channel->recog_request));
demo_recog_start_of_input(recog_channel);
break;
/* Speech ended: log it and complete the recognition with the SUCCESS cause. */
case MPF_DETECTOR_EVENT_INACTIVITY:
apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"Detected Voice Inactivity " APT_SIDRES_FMT,
MRCP_MESSAGE_SIDRES(recog_channel->recog_request));
demo_recog_recognition_complete(recog_channel,RECOGNIZER_COMPLETION_CAUSE_SUCCESS);
break;
/* Noinput timeout elapsed: always logged, but the recognition is completed
 * with NO_INPUT_TIMEOUT only if the channel's timers have been started. */
case MPF_DETECTOR_EVENT_NOINPUT:
apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"Detected Noinput " APT_SIDRES_FMT,
MRCP_MESSAGE_SIDRES(recog_channel->recog_request));
if(recog_channel->timers_started == TRUE) {
demo_recog_recognition_complete(recog_channel,RECOGNIZER_COMPLETION_CAUSE_NO_INPUT_TIMEOUT);
}
break;
/* Any other event value requires no action. */
default:
break;
/**
 * Process current frame.
 *
 * Computes the activity level of the frame (non-audio frames count as level 0),
 * compares it against the detector's level threshold and advances the
 * four-state machine:
 *
 *   INACTIVITY -> ACTIVITY_TRANSITION -> ACTIVITY
 *   ACTIVITY   -> INACTIVITY_TRANSITION -> INACTIVITY
 *
 * A transition state falls back to the state it came from as soon as a frame
 * contradicts it. The duration counter is advanced by CODEC_FRAME_TIME_BASE
 * for every frame that confirms the current state.
 *
 * @param detector the detector whose state is advanced
 * @param frame the media frame to analyze
 * @return MPF_DETECTOR_EVENT_ACTIVITY, MPF_DETECTOR_EVENT_INACTIVITY or
 *         MPF_DETECTOR_EVENT_NOINPUT when the corresponding timeout is reached,
 *         MPF_DETECTOR_EVENT_NONE otherwise
 */
MPF_DECLARE(mpf_detector_event_e) mpf_activity_detector_process(mpf_activity_detector_t *detector, const mpf_frame_t *frame)
{
	mpf_detector_event_e result = MPF_DETECTOR_EVENT_NONE;
	apr_size_t level = 0;
	int is_loud;

	if((frame->type & MEDIA_FRAME_TYPE_AUDIO) == MEDIA_FRAME_TYPE_AUDIO) {
		/* only audio frames contribute a level; everything else stays at 0 */
		level = mpf_activity_detector_level_calculate(frame);
#if 0
		apt_log(APT_LOG_MARK,APT_PRIO_INFO,"Activity Detector [%"APR_SIZE_T_FMT"]",level);
#endif
	}

	/* a frame is "loud" when its level reaches the configured threshold */
	is_loud = (level >= detector->level_threshold);

	switch(detector->state) {
		case DETECTOR_STATE_INACTIVITY:
			if(is_loud) {
				/* possible start of speech */
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY_TRANSITION);
				break;
			}
			/* still silent: keep counting towards the noinput timeout */
			detector->duration += CODEC_FRAME_TIME_BASE;
			if(detector->duration >= detector->noinput_timeout) {
				result = MPF_DETECTOR_EVENT_NOINPUT;
			}
			break;

		case DETECTOR_STATE_ACTIVITY_TRANSITION:
			if(!is_loud) {
				/* too short to count as speech: back to inactivity */
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY);
				break;
			}
			detector->duration += CODEC_FRAME_TIME_BASE;
			if(detector->duration >= detector->speech_timeout) {
				/* activity lasted long enough: report it */
				result = MPF_DETECTOR_EVENT_ACTIVITY;
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY);
			}
			break;

		case DETECTOR_STATE_ACTIVITY:
			if(is_loud) {
				detector->duration += CODEC_FRAME_TIME_BASE;
			}
			else {
				/* possible end of speech */
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY_TRANSITION);
			}
			break;

		case DETECTOR_STATE_INACTIVITY_TRANSITION:
			if(is_loud) {
				/* silence was too short: back to activity */
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY);
				break;
			}
			detector->duration += CODEC_FRAME_TIME_BASE;
			if(detector->duration >= detector->silence_timeout) {
				/* silence lasted long enough: report it */
				result = MPF_DETECTOR_EVENT_INACTIVITY;
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY);
			}
			break;

		default:
			/* any other state value: nothing to do */
			break;
	}

	return result;
}
具体的能量(level)计算由mpf_activity_detector_level_calculate()完成:
static apr_size_t mpf_activity_detector_level_calculate(const mpf_frame_t *frame)
{
apr_size_t sum = 0;
apr_size_t count = frame->codec_frame.size/2;
const apr_int16_t *cur = frame->codec_frame.buffer;
const apr_int16_t *end = cur + count;
for(; cur < end; cur++) {
if(*cur < 0) {
sum -= *cur;
}
else {
sum += *cur;
}
}
return sum / count;
}
对于这个算法,不需要太较真:它对一帧内各采样点的绝对值累加后求平均值,如果平均值大于等于阈值,表示有声音;如果小于阈值,表示静音。并没有噪音检测。如果生产需要,肯定是需要修改的。