unimrcp的voice activity detector

戈嘉慕

2023-12-01

unimrcp本身提供了一个简单的VAD算法模块，这里并不讨论这个算法的好坏，只记录一下它的用法流程。

算法实现代码：libs/mpf/src/mpf_activity_detector.c

使用时，调用mpf_activity_detector_create()创建，以demo_recog_engine为例：

    /* create demo recog channel */
	demo_recog_channel_t *recog_channel = apr_palloc(pool,sizeof(demo_recog_channel_t));
	recog_channel->demo_engine = engine->obj;
	recog_channel->recog_request = NULL;
	recog_channel->stop_response = NULL;
	recog_channel->detector = mpf_activity_detector_create(pool);
	recog_channel->audio_out = NULL;

mpf_activity_detector_create的实现代码：

/**
 * Create activity detector.
 *
 * Allocates the detector from the given pool and fills in the built-in
 * defaults: level threshold, the three timeouts, a zero duration counter and
 * the initial INACTIVITY state.
 *
 * @param pool the memory pool to allocate the detector from
 * @return the newly allocated, initialised detector
 */
MPF_DECLARE(mpf_activity_detector_t*) mpf_activity_detector_create(apr_pool_t *pool)
{
	mpf_activity_detector_t *vad = apr_palloc(pool,sizeof(*vad));

	/* the state machine starts idle with nothing accumulated */
	vad->state = DETECTOR_STATE_INACTIVITY;
	vad->duration = 0;

	/* level at or above which a frame counts as activity (0 .. 255) */
	vad->level_threshold = 2;

	/* default timeouts */
	vad->noinput_timeout = 5000; /* 5 s */
	vad->speech_timeout = 300;   /* 0.3 s */
	vad->silence_timeout = 300;  /* 0.3 s */

	return vad;
}

就是初始化detector的参数，包括能量阈值、状态转换的时间阈值。

这些值，都提供了设置的接口：

/**
 * Create activity detector.
 * @param pool the memory pool the detector is allocated from
 * @return the created detector, initialised with default threshold and timeouts
 */
MPF_DECLARE(mpf_activity_detector_t*) mpf_activity_detector_create(apr_pool_t *pool);

/**
 * Reset activity detector.
 * NOTE(review): the implementation is not shown here; presumably it returns
 * the detector to its initial state so it can be reused -- confirm.
 * @param detector the detector to reset
 */
MPF_DECLARE(void) mpf_activity_detector_reset(mpf_activity_detector_t *detector);

/**
 * Set threshold of voice activity (silence) level.
 * A frame whose calculated level is greater than or equal to this value is
 * treated as activity; anything below it is treated as silence.
 * @param detector the detector to configure
 * @param level_threshold the level threshold (the default is annotated as 0 .. 255)
 */
MPF_DECLARE(void) mpf_activity_detector_level_set(mpf_activity_detector_t *detector, apr_size_t level_threshold);

/**
 * Set noinput timeout.
 * The NOINPUT event is reported once the time spent below the threshold in
 * the inactive state reaches this value.
 * NOTE(review): unit appears to be milliseconds (default 5000 is annotated
 * as 5 s) -- confirm.
 * @param detector the detector to configure
 * @param noinput_timeout the timeout to set
 */
MPF_DECLARE(void) mpf_activity_detector_noinput_timeout_set(mpf_activity_detector_t *detector, apr_size_t noinput_timeout);

/**
 * Set timeout required to trigger speech (transition from inactive to active state).
 * NOTE(review): unit appears to be milliseconds (default 300 is annotated
 * as 0.3 s) -- confirm.
 * @param detector the detector to configure
 * @param speech_timeout the timeout to set
 */
MPF_DECLARE(void) mpf_activity_detector_speech_timeout_set(mpf_activity_detector_t *detector, apr_size_t speech_timeout);

/**
 * Set timeout required to trigger silence (transition from active to inactive state).
 * NOTE(review): unit appears to be milliseconds (default 300 is annotated
 * as 0.3 s) -- confirm.
 * @param detector the detector to configure
 * @param silence_timeout the timeout to set
 */
MPF_DECLARE(void) mpf_activity_detector_silence_timeout_set(mpf_activity_detector_t *detector, apr_size_t silence_timeout);

如果使用1.7里缺省的level_threshold 2，我的实测结果是收不到打断事件。

收到媒体包时，调用mpf_activity_detector_process处理，里面维护了一个状态机：

if(recog_channel->recog_request) {
		mpf_detector_event_e det_event = mpf_activity_detector_process(recog_channel->detector,frame);
		switch(det_event) {
			case MPF_DETECTOR_EVENT_ACTIVITY:
				apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"Detected Voice Activity " APT_SIDRES_FMT,
					MRCP_MESSAGE_SIDRES(recog_channel->recog_request));
				demo_recog_start_of_input(recog_channel);
				break;
			case MPF_DETECTOR_EVENT_INACTIVITY:
				apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"Detected Voice Inactivity " APT_SIDRES_FMT,
					MRCP_MESSAGE_SIDRES(recog_channel->recog_request));
				demo_recog_recognition_complete(recog_channel,RECOGNIZER_COMPLETION_CAUSE_SUCCESS);
				break;
			case MPF_DETECTOR_EVENT_NOINPUT:
				apt_log(RECOG_LOG_MARK,APT_PRIO_INFO,"Detected Noinput " APT_SIDRES_FMT,
					MRCP_MESSAGE_SIDRES(recog_channel->recog_request));
				if(recog_channel->timers_started == TRUE) {
					demo_recog_recognition_complete(recog_channel,RECOGNIZER_COMPLETION_CAUSE_NO_INPUT_TIMEOUT);
				}
				break;
			default:
				break;

/**
 * Process current frame.
 *
 * Computes the level of the frame (non-audio frames count as level 0),
 * compares it with the configured threshold and advances the four-state
 * machine INACTIVITY -> ACTIVITY_TRANSITION -> ACTIVITY ->
 * INACTIVITY_TRANSITION. Each frame that keeps the detector in its current
 * state adds CODEC_FRAME_TIME_BASE to the duration counter.
 *
 * @param detector the detector whose state is advanced
 * @param frame the media frame to analyse
 * @return MPF_DETECTOR_EVENT_ACTIVITY, MPF_DETECTOR_EVENT_INACTIVITY or
 *         MPF_DETECTOR_EVENT_NOINPUT when the matching timeout is reached,
 *         MPF_DETECTOR_EVENT_NONE otherwise
 */
MPF_DECLARE(mpf_detector_event_e) mpf_activity_detector_process(mpf_activity_detector_t *detector, const mpf_frame_t *frame)
{
	mpf_detector_event_e result = MPF_DETECTOR_EVENT_NONE;
	apr_size_t energy = 0;
	int above_threshold;

	/* only audio frames carry a measurable level; everything else stays at 0 */
	if((frame->type & MEDIA_FRAME_TYPE_AUDIO) == MEDIA_FRAME_TYPE_AUDIO) {
		energy = mpf_activity_detector_level_calculate(frame);
#if 0
		apt_log(APT_LOG_MARK,APT_PRIO_INFO,"Activity Detector [%"APR_SIZE_T_FMT"]",energy);
#endif
	}
	above_threshold = (energy >= detector->level_threshold);

	switch(detector->state) {
		case DETECTOR_STATE_INACTIVITY:
			if(above_threshold) {
				/* first loud frame: begin confirming activity */
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY_TRANSITION);
				break;
			}
			/* still quiet: keep counting towards the noinput timeout */
			detector->duration += CODEC_FRAME_TIME_BASE;
			if(detector->duration >= detector->noinput_timeout) {
				result = MPF_DETECTOR_EVENT_NOINPUT;
			}
			break;

		case DETECTOR_STATE_ACTIVITY_TRANSITION:
			if(!above_threshold) {
				/* the burst was too short: fall back to inactivity */
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY);
				break;
			}
			detector->duration += CODEC_FRAME_TIME_BASE;
			if(detector->duration >= detector->speech_timeout) {
				/* loud for long enough: activity is confirmed */
				result = MPF_DETECTOR_EVENT_ACTIVITY;
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY);
			}
			break;

		case DETECTOR_STATE_ACTIVITY:
			if(above_threshold) {
				detector->duration += CODEC_FRAME_TIME_BASE;
			}
			else {
				/* first quiet frame: begin confirming inactivity */
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY_TRANSITION);
			}
			break;

		case DETECTOR_STATE_INACTIVITY_TRANSITION:
			if(above_threshold) {
				/* the pause was too short: fall back to activity */
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_ACTIVITY);
				break;
			}
			detector->duration += CODEC_FRAME_TIME_BASE;
			if(detector->duration >= detector->silence_timeout) {
				/* quiet for long enough: inactivity is confirmed */
				result = MPF_DETECTOR_EVENT_INACTIVITY;
				mpf_activity_detector_state_change(detector,DETECTOR_STATE_INACTIVITY);
			}
			break;

		default:
			break;
	}

	return result;
}

具体算法调用就是mpf_activity_detector_level_calculate()

/**
 * Calculate the activity level of a frame.
 *
 * The codec frame buffer is read as an array of 16-bit signed samples
 * (codec_frame.size / 2 of them) and the level is the integer mean of their
 * absolute values.
 *
 * @param frame the frame whose codec_frame buffer is analysed
 * @return the mean absolute sample value, or 0 when the frame holds no
 *         complete sample
 */
static apr_size_t mpf_activity_detector_level_calculate(const mpf_frame_t *frame)
{
	apr_size_t sum = 0;
	apr_size_t count = frame->codec_frame.size/2;
	const apr_int16_t *cur;
	const apr_int16_t *end;

	/* an empty (or one-byte) frame has no samples: report silence rather
	 * than dividing by zero below, and do not touch the buffer at all */
	if(count == 0) {
		return 0;
	}

	cur = frame->codec_frame.buffer;
	end = cur + count;

	for(; cur < end; cur++) {
		if(*cur < 0) {
			/* subtracting a negative sample adds its magnitude */
			sum -= *cur;
		}
		else {
			sum += *cur;
		}
	}

	return sum / count;
}

对于这个算法，不需要太较真：对一帧内各采样值的绝对值累加后求平均值，如果大于等于阈值，表示有声音，如果小于阈值，表示静音。并没有噪音检测。如果生产需要，肯定是需要修改的。

unimrcp的voice activity detector

相关阅读

相关文章

相关问答

相关文档