硬件平台:海思3516CV500
编译条件:O3 + NEON
#include <stdio.h>
#include <stdlib.h>
#include <arm_neon.h>
#include <sys/time.h>
#include "NE10.h"
#include "NE10_sample_intro.h"
#include "alg_test.h"
/**
 * Scalar reference implementation: adds up every element of arr1 and every
 * element of arr2 and returns the combined total.
 *
 * @param arr1 first input array, at least len elements
 * @param arr2 second input array, at least len elements
 * @param len  number of elements to read from each array
 * @return (sum of arr1[0..len-1]) + (sum of arr2[0..len-1]); 0 when len <= 0
 */
float sum_array(float *arr1, float *arr2,int len)
{
    /* Both accumulators must start at zero: reading an uninitialized
     * automatic variable is undefined behavior and yields arbitrary sums. */
    float sum1 = 0.0f;
    float sum2 = 0.0f;
    int i;

    for (i = 0; i < len; ++i) {
        sum1 += arr1[i];
    }
    for (i = 0; i < len; ++i) {
        sum2 += arr2[i];
    }
    return sum1 + sum2;
}
/*
 * Sums one float array with NEON: full groups of four elements are
 * accumulated lane-wise in a vector register, the four lanes are then added
 * together in lane order 0..3, and the remaining (len & 3) elements are added
 * one by one.
 */
static float sum_array_neon_single(const float *src, int len)
{
    float32x4_t lane_acc = vdupq_n_f32(0.0); /* four running partial sums */
    int groups = len >> 2;                   /* number of complete 4-element groups */
    int rest = len & 3;                      /* elements left after the groups */
    float total;

    while (groups > 0) {
        /* load four consecutive floats and add them to the partial sums */
        lane_acc = vaddq_f32(lane_acc, vld1q_f32(src));
        src += 4;
        --groups;
    }

    /* horizontal reduction of the four lanes */
    total = vgetq_lane_f32(lane_acc, 0);
    total += vgetq_lane_f32(lane_acc, 1);
    total += vgetq_lane_f32(lane_acc, 2);
    total += vgetq_lane_f32(lane_acc, 3);

    /* scalar tail */
    while (rest > 0) {
        total += *src;
        ++src;
        --rest;
    }
    return total;
}

/**
 * NEON implementation of the two-array sum: computes the sum of arr1 and the
 * sum of arr2 independently (four floats per vector add) and returns their
 * total.
 *
 * @param arr1 first input array, at least len elements
 * @param arr2 second input array, at least len elements
 * @param len  number of elements to read from each array
 * @return sum of arr1 plus sum of arr2
 */
float sum_array_neon(float *arr1, float *arr2,int len)
{
    const float first_total = sum_array_neon_single(arr1, len);
    const float second_total = sum_array_neon_single(arr2, len);

    return first_total + second_total;
}
#define size 10240000 /* number of float elements in each benchmark array */

/**
 * Benchmark driver: initialises Ne10, fills two float arrays with 1, times
 * sum_array() against sum_array_neon() on the same data, prints both results
 * and elapsed times in microseconds, then runs my_alg_test().
 *
 * @return 0 on success, 1 if Ne10 initialisation or buffer allocation fails
 */
int main(void)
{
    if (ne10_init() != NE10_OK)
    {
        printf("ne10_init failed ! \n");
        return 1;
    }
    printf("ne10_init ok ! \n");

    struct timeval start, end;
    long elapsed_us;
    float sum_result;
    int i;

    float *data1 = malloc(size * sizeof *data1);
    float *data2 = malloc(size * sizeof *data2);
    if (data1 == NULL || data2 == NULL)
    {
        /* Each buffer is tens of megabytes; do not dereference a failed allocation. */
        printf("malloc failed ! \n");
        free(data1);
        free(data2);
        return 1;
    }

    for (i = 0; i < size; i++)
    {
        data1[i] = 1;
        data2[i] = 1;
    }

    printf("-------- SUM_ARRAY_TEST_C test -------- \r\n");
    gettimeofday(&start, NULL);
    sum_result = sum_array(data1, data2, size);
    gettimeofday(&end, NULL);
    /* The timeval fields are not int, so compute in long and print with %ld. */
    elapsed_us = 1000000L * (long)(end.tv_sec - start.tv_sec) + (long)(end.tv_usec - start.tv_usec);
    printf("sum_result = %f \n", sum_result);
    printf("Alg C use time %ldus\n", elapsed_us);

    printf("-------- SUM_ARRAY_TEST_NEON test -------- \r\n");
    gettimeofday(&start, NULL);
    sum_result = sum_array_neon(data1, data2, size);
    gettimeofday(&end, NULL);
    elapsed_us = 1000000L * (long)(end.tv_sec - start.tv_sec) + (long)(end.tv_usec - start.tv_usec);
    printf("sum_result = %f \n", sum_result);
    printf("Alg NEON use time %ldus\n", elapsed_us);

    free(data1);
    free(data2);

    printf("\n");
    my_alg_test();
    return 0;
}
测试结果:
/mnt/fuhang/ne10_test/test00 # ./ne10_test
ne10_init ok !
-------- SUM_ARRAY_TEST_C test --------
sum_result = 20480000.000000
Alg C use time 86716us
-------- SUM_ARRAY_TEST_NEON test --------
sum_result = 20480000.000000
Alg NEON use time 86667us
说明:对于简单的数学运算,开启 O3 优化后编译器会自动对代码进行向量化,实测耗时与手工向量化的代码相差不大。
#include <stdio.h>
#include <arm_neon.h>
#include <sys/time.h>
#include "NE10.h"
#include "alg_test.h"
/*
test for complex alg
for (i = 0; i < n_img_size; i++)
{
temp[i] = (n_w_array[i] * g_pusTffLastFrame[i] + (4096 - n_w_array[i]) * pus_src[i]) >> 12;
}
*/
/**
 * Scalar reference for the weighted blend
 *     result[i] = (arr3[i] * arr1[i] + (4096 - arr3[i]) * arr2[i]) >> 12
 * evaluated in int arithmetic for every i in [0, len).
 *
 * @param arr1   first input, multiplied by the weight arr3[i]
 * @param arr2   second input, multiplied by (4096 - arr3[i])
 * @param arr3   per-element weights
 * @param result output array, at least len elements
 * @param len    number of elements to process
 * @return always 0
 */
short algtest_c(short *arr1, short *arr2, short *arr3, short *result, int len)
{
    int idx = 0;

    while (idx < len) {
        const int weight = arr3[idx];
        const int blended = weight * arr1[idx] + (4096 - weight) * arr2[idx];

        /* dividing by 4096 (>> 12) undoes the weight scaling */
        result[idx] = blended >> 12;
        ++idx;
    }
    return 0;
}
// short          : 16-bit signed,   range -32768..32767 (2^15 = 32768 = 0x8000)
// unsigned short : 16-bit unsigned, range 0..65535      (2^16 - 1 = 65535 = 0xFFFF)
/**
 * NEON implementation of the weighted blend
 *     result[i] = (arr3[i] * arr1[i] + (4096 - arr3[i]) * arr2[i]) >> 12
 * processing eight elements per iteration.
 *
 * The products are formed with widening multiplies into 32-bit lanes, so the
 * intermediate sum no longer wraps at 16 bits, and the elements left over
 * when len is not a multiple of 8 are handled by a scalar loop.
 * (4096 - arr3[i]) itself is computed in 16 bits.
 *
 * @param arr1   first input, multiplied by the weight arr3[i]
 * @param arr2   second input, multiplied by (4096 - arr3[i])
 * @param arr3   per-element weights
 * @param result output array, at least len elements
 * @param len    number of elements to process
 * @return always 0
 */
short algtest_neon(unsigned short *arr1, unsigned short *arr2, unsigned short *arr3, unsigned short *result, int len)
{
    const uint16x8_t full_weight = vdupq_n_u16(4096);
    int i = 0;

    /* Vector part: eight elements per iteration. */
    for (; len - i >= 8; i += 8)
    {
        uint16x8_t a_vec = vld1q_u16(arr1 + i);
        uint16x8_t b_vec = vld1q_u16(arr2 + i);
        uint16x8_t w_vec = vld1q_u16(arr3 + i);
        uint16x8_t inv_w_vec = vsubq_u16(full_weight, w_vec); /* 4096 - w */

        /* Widen to 32 bits: w*a + (4096 - w)*b, four lanes at a time. */
        uint32x4_t acc_lo = vmull_u16(vget_low_u16(w_vec), vget_low_u16(a_vec));
        acc_lo = vmlal_u16(acc_lo, vget_low_u16(inv_w_vec), vget_low_u16(b_vec));
        uint32x4_t acc_hi = vmull_u16(vget_high_u16(w_vec), vget_high_u16(a_vec));
        acc_hi = vmlal_u16(acc_hi, vget_high_u16(inv_w_vec), vget_high_u16(b_vec));

        /* >> 12 and narrow back to 16 bits, then store eight results. */
        vst1q_u16(result + i, vcombine_u16(vshrn_n_u32(acc_lo, 12), vshrn_n_u32(acc_hi, 12)));
    }

    /* Scalar tail: same arithmetic for the remaining (len % 8) elements. */
    for (; i < len; i++)
    {
        unsigned int w = arr3[i];
        unsigned int inv_w = (4096u - w) & 0xFFFFu; /* 16-bit subtraction, as in the vector path */

        result[i] = (unsigned short)((w * arr1[i] + inv_w * arr2[i]) >> 12);
    }
    return 0;
}
#define data_len 10240000 // elements per buffer; for scale, 1024*768 = 786432

/**
 * Benchmarks algtest_c() against algtest_neon() on identical random input.
 *
 * Allocates three input buffers and one output buffer of data_len shorts,
 * fills the inputs with rand() values (arr1 and arr3 in 0..127, arr2 in 0..3),
 * times each implementation with gettimeofday() and prints the elapsed
 * microseconds. The output buffer is zeroed before each run.
 *
 * @return 0 on success, -1 if a buffer could not be allocated
 */
int my_alg_test(void)
{
    printf("my_alg_test ! \n");

    short *in_data1 = (short*)malloc(data_len * sizeof *in_data1);
    short *in_data2 = (short*)malloc(data_len * sizeof *in_data2);
    short *in_data3 = (short*)malloc(data_len * sizeof *in_data3);
    short *out_data1 = (short*)malloc(data_len * sizeof *out_data1);
    if (in_data1 == NULL || in_data2 == NULL || in_data3 == NULL || out_data1 == NULL)
    {
        /* free(NULL) is a no-op, so releasing all four is safe. */
        printf("my_alg_test: malloc failed ! \n");
        free(in_data1);
        free(in_data2);
        free(in_data3);
        free(out_data1);
        return -1;
    }

    struct timeval start, end;
    long elapsed_us;
    int i;

    for (i = 0; i < data_len; i++)
    {
        in_data1[i] = (short)(rand() % 128);
        in_data2[i] = (short)(rand() % 4);
        in_data3[i] = (short)(rand() % 128);
        out_data1[i] = 0;
    }

    printf("-------- algtest_c test -------- \r\n");
    gettimeofday(&start, NULL);
    algtest_c(in_data1, in_data2, in_data3, out_data1, data_len);
    gettimeofday(&end, NULL);
    /* The timeval fields are not int, so compute in long and print with %ld. */
    elapsed_us = 1000000L * (long)(end.tv_sec - start.tv_sec) + (long)(end.tv_usec - start.tv_usec);
    printf("algtest_c use time %ldus\n", elapsed_us);
    printf("\r\n ");

    /* Clear the output so the second run starts from the same state. */
    for (i = 0; i < data_len; i++)
    {
        out_data1[i] = 0;
    }

    printf("-------- algtest_neon test -------- \r\n");
    gettimeofday(&start, NULL);
    algtest_neon((unsigned short *)in_data1, (unsigned short *)in_data2, (unsigned short *)in_data3, (unsigned short *)out_data1, data_len);
    gettimeofday(&end, NULL);
    elapsed_us = 1000000L * (long)(end.tv_sec - start.tv_sec) + (long)(end.tv_usec - start.tv_usec);
    printf("algtest_neon use time %ldus\n", elapsed_us);
    printf("\r\n ");

    free(in_data1);
    free(in_data2);
    free(in_data3);
    free(out_data1);
    return 0;
}
测试结果:
my_alg_test !
-------- algtest_c test --------
algtest_c use time 187637us
-------- algtest_neon test --------
algtest_neon use time 155556us
对于比较复杂的数学运算,与仅依靠 O3 编译器优化相比,手工向量化的代码在数据量越大时优势越明显;数据量较小时则体现不出优势。同时要特别注意中间结果是否会超出数据类型的表示范围(溢出),防止向量化后出现计算错误。