最近在做360环视的过程中碰到了图像融合的问题,由于板子上cpu算力有限,最终尝试用neon加速一下,最后总体算下来大概有5ms的加速,融合源图像越大,加速效果越明显,大致思路如下:
1.由于图像是切片引用过来的,内存并不连续,而NEON加载指令需要从一块连续的内存中读取数据,因此这里先深拷贝clone一下;图像过小时不建议使用这种方式,因为clone本身也会增加耗时,融合加速带来的收益可能还抵不上clone的开销;
2.为了一次性处理八个数据,由于精度要求不高,所以采用定点优化,将权重取整到0~255,乘加之后再右移八位还原
/**
 * @brief Alpha-blends two 8-bit BGR/RGB images with a per-pixel 8-bit weight map,
 *        processing eight pixels per iteration with NEON.
 *
 * For every pixel and channel the result is
 *     out = (a * w + b * (255 - w)) >> 8
 * where `a` comes from imA, `b` from imB and `w` from weight. This is a
 * fixed-point approximation (the weights sum to 255 but the result is divided
 * by 256), so the output can be up to one grey level darker than an exact blend.
 *
 * @param imA    First image, CV_8UC3. Not modified; the blend is written to a copy.
 * @param imB    Second image, CV_8UC3, same size as imA.
 * @param weight Weight of imA per pixel, CV_8UC1, same size as imA (255 = all imA).
 * @return A new continuous CV_8UC3 image holding the blended result.
 */
Mat StichManage::m_merge_neon(cv::Mat& imA, cv::Mat& imB, Mat& weight){
    // Nothing to blend: keep the historical behaviour of returning a copy.
    if (imA.empty())
        return imA.clone();

    // The loops below walk raw bytes, so layout mismatches would read or write
    // out of bounds; fail loudly instead.
    CV_Assert(imA.type() == CV_8UC3 && imB.type() == CV_8UC3);
    CV_Assert(weight.type() == CV_8UC1);
    CV_Assert(imB.size() == imA.size() && weight.size() == imA.size());

    // The inputs may be ROI slices whose rows are not adjacent in memory, while
    // the pointer walk below needs one contiguous buffer per image.
    // imA is always cloned because the clone doubles as the output buffer;
    // imB and weight are read-only, so they are copied only when necessary.
    Mat imA_clone = imA.clone();
    Mat imB_cont = imB.isContinuous() ? imB : imB.clone();
    Mat weight_cont = weight.isContinuous() ? weight : weight.clone();

    const size_t pixel_count = imA_clone.total();
    const size_t block_count = pixel_count / 8;  // groups of eight pixels
    const size_t tail_count = pixel_count % 8;   // leftover pixels handled one by one

    uchar* p_data_a = imA_clone.data;
    const uchar* p_data_b = imB_cont.data;
    const uchar* p_weight = weight_cont.data;

    const uint8x8_t full_scale = vdup_n_u8(255);
    for (size_t i = 0; i < block_count; ++i)
    {
        // De-interleaving loads: val[c] holds channel c of eight consecutive pixels.
        uint8x8x3_t px_a = vld3_u8(p_data_a);
        const uint8x8x3_t px_b = vld3_u8(p_data_b);
        const uint8x8_t w_a = vld1_u8(p_weight);
        const uint8x8_t w_b = vsub_u8(full_scale, w_a);

        for (int c = 0; c < 3; ++c)
        {
            // a*w + b*(255-w) <= 255*255, so the 16-bit accumulator cannot overflow.
            uint16x8_t acc = vmull_u8(px_a.val[c], w_a);
            acc = vmlal_u8(acc, px_b.val[c], w_b);
            px_a.val[c] = vshrn_n_u16(acc, 8);
        }

        vst3_u8(p_data_a, px_a);
        p_data_a += 24;
        p_data_b += 24;
        p_weight += 8;
    }

    // Scalar tail: same fixed-point formula as the vector path so that the last
    // pixels are blended exactly like the rest of the image.
    for (size_t i = 0; i < tail_count; ++i)
    {
        const int w_a = *p_weight;
        const int w_b = 255 - w_a;
        for (int c = 0; c < 3; ++c)
        {
            *p_data_a = static_cast<uchar>(((*p_data_a) * w_a + (*p_data_b) * w_b) >> 8);
            ++p_data_a;
            ++p_data_b;
        }
        ++p_weight;
    }

    return imA_clone;
}