当前位置: 首页 > 工具软件 > Ne10 > 使用案例 >

NE10算法测试demo1

公沈浪
2023-12-01

测试环境

硬件平台:海思3516CV500

编译条件:O3 + NEON

test1

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>

#include <arm_neon.h>

#include "NE10.h"
#include "NE10_sample_intro.h"
#include "alg_test.h"

/*
 * Scalar reference implementation: returns the sum of the first len
 * elements of arr1 plus the sum of the first len elements of arr2.
 *
 * arr1, arr2: arrays holding at least len floats each (only read).
 * len:        number of elements to read from each array; a value <= 0
 *             reads nothing and yields 0.
 *
 * Each array is accumulated separately in single precision and the two
 * partial sums are added once at the end.
 */
float sum_array(float *arr1, float *arr2,int len)
{
    /* The accumulators must start at zero: "+=" on an uninitialised
     * automatic variable reads an indeterminate value. */
    float sum1 = 0.0f;
    float sum2 = 0.0f;
    int i;

    for (i = 0; i < len; ++i)
    {
        sum1 += arr1[i];
    }
    for (i = 0; i < len; ++i)
    {
        sum2 += arr2[i];
    }

    return sum1 + sum2;
}

/* Adds the four lanes of v together, in lane order 0, 1, 2, 3. */
static float sum_lanes_f32(float32x4_t v)
{
    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
}

/*
 * Sums len floats starting at p: whole groups of four are accumulated
 * lane-wise in a NEON register, the register is then reduced to a scalar,
 * and the remaining (len & 3) elements are added one by one.
 */
static float sum_one_array_neon(float *p, int len)
{
    float32x4_t acc = vdupq_n_f32(0.0); /* four running partial sums */
    int groups = len >> 2;              /* number of full 4-element groups */
    int tail = len & 3;                 /* elements left after the groups */
    float total;
    int k;

    for (k = 0; k < groups; ++k)
    {
        acc = vaddq_f32(acc, vld1q_f32(p)); /* acc[j] += p[j], j = 0..3 */
        p += 4;
    }

    total = sum_lanes_f32(acc);

    for (k = 0; k < tail; ++k)
    {
        total += p[k];
    }

    return total;
}

/*
 * NEON version of the array sum: returns the sum of the first len elements
 * of arr1 plus the sum of the first len elements of arr2.
 *
 * arr1, arr2: arrays holding at least len floats each (only read).
 * len:        number of elements to read from each array.
 *
 * arr1 is fully reduced before arr2 is touched; the two scalar sums are
 * added at the end.
 */
float sum_array_neon(float *arr1, float *arr2,int len)
{
    float first = sum_one_array_neon(arr1, len);
    float second = sum_one_array_neon(arr2, len);

    return first + second;
}

#define size 10240000

/*
 * Benchmark driver: initialises Ne10, fills two float arrays of `size`
 * elements with 1, then times sum_array() and sum_array_neon() on them
 * with gettimeofday() and prints each result and elapsed time in
 * microseconds. Finally runs my_alg_test().
 *
 * Returns 0 on success, 1 if Ne10 initialisation or memory allocation
 * fails.
 */
int main(void)
{
    struct timeval start, end;
    float *data1;
    float *data2;
    float sum_result;
    long used_us;
    int i;

    if (ne10_init() != NE10_OK)
    {
        printf("ne10_init failed ! \n");
        return 1;
    }
    printf("ne10_init ok ! \n");

    data1 = malloc(size * sizeof *data1);
    data2 = malloc(size * sizeof *data2);
    if (data1 == NULL || data2 == NULL)
    {
        /* Roughly 40 MB per buffer: allocation can fail on a small board. */
        printf("malloc failed ! \n");
        free(data1);
        free(data2);
        return 1;
    }

    for (i = 0; i < size; i++)
    {
        data1[i] = 1;
        data2[i] = 1;
    }

    printf("-------- SUM_ARRAY_TEST_C test -------- \r\n");
    gettimeofday(&start, NULL);
    sum_result = sum_array(data1, data2, size);
    gettimeofday(&end, NULL);
    /* Compute in long and print with %ld so the format matches the type. */
    used_us = 1000000L * (long)(end.tv_sec - start.tv_sec) + (long)(end.tv_usec - start.tv_usec);
    printf("sum_result = %f \n", sum_result);
    printf("Alg C use time %ldus\n", used_us);

    printf("-------- SUM_ARRAY_TEST_NEON test -------- \r\n");
    gettimeofday(&start, NULL);
    sum_result = sum_array_neon(data1, data2, size);
    gettimeofday(&end, NULL);
    used_us = 1000000L * (long)(end.tv_sec - start.tv_sec) + (long)(end.tv_usec - start.tv_usec);
    printf("sum_result = %f \n", sum_result);
    printf("Alg NEON use time %ldus\n", used_us);

    free(data1);
    free(data2);

    printf("\n");

    my_alg_test();

    return 0;
}

测试结果:

/mnt/fuhang/ne10_test/test00 # ./ne10_test
ne10_init ok ! 
-------- SUM_ARRAY_TEST_C test -------- 
sum_result = 20480000.000000 
Alg C use time 86716us
-------- SUM_ARRAY_TEST_NEON test -------- 
sum_result = 20480000.000000 
Alg NEON use time 86667us

        说明:对于简单的数学运算,开启 O3 优化后编译器会自动对代码进行向量化,实际耗时与手工向量化的代码相比差别不大。

test2

#include <stdio.h>
#include <arm_neon.h>
#include <sys/time.h>
#include "NE10.h"
#include "alg_test.h"

/*
    test for complex alg
	for (i = 0; i < n_img_size; i++)
	{
		temp[i] = (n_w_array[i] * g_pusTffLastFrame[i] + (4096 - n_w_array[i]) * pus_src[i]) >> 12;		
	}
*/


/*
 * Scalar reference for the per-element weighted blend
 *
 *     result[i] = (arr3[i] * arr1[i] + (4096 - arr3[i]) * arr2[i]) >> 12
 *
 * arr1, arr2: the two inputs being blended.
 * arr3:       per-element weight applied to arr1; arr2 gets 4096 - weight.
 * result:     output array, at least len elements.
 * len:        number of elements to process.
 *
 * The arithmetic is carried out in int (the short operands are promoted)
 * and the shifted value is converted back to short on store.
 * Always returns 0.
 */
short algtest_c(short *arr1, short *arr2, short *arr3, short *result, int len)
{
    int idx = 0;

    while (idx < len)
    {
        const int weight = arr3[idx];
        const int blended = weight * arr1[idx] + (4096 - weight) * arr2[idx];

        result[idx] = blended >> 12;
        ++idx;
    }

    return 0;
}

//         short : range -2^15 .. 2^15-1  (2^15 = 32768 = 0x8000 = 1000 0000 0000 0000)
//unsigned short : range     0 .. 2^16-1  (2^16-1 = 65535 = 0xFFFF = 1111 1111 1111 1111)
/*
 * NEON implementation of the per-element weighted blend
 *
 *     result[i] = (arr3[i] * arr1[i] + (4096 - arr3[i]) * arr2[i]) >> 12
 *
 * arr1, arr2: the two inputs being blended (only read).
 * arr3:       per-element weight applied to arr1; intended range 0..4096
 *             (4096 - weight is computed in 16 bits and wraps for larger
 *             weights).
 * result:     output array, at least len elements.
 * len:        number of elements to process; a value <= 0 does nothing.
 *
 * Products and their sum are formed in 32-bit lanes so that intermediate
 * values larger than 65535 are not truncated; a plain 16-bit multiply
 * would wrap modulo 2^16. Elements left over after the last full group of
 * eight are handled by a scalar loop using the same arithmetic.
 * Always returns 0.
 */
short algtest_neon(unsigned short *arr1, unsigned short *arr2, unsigned short *arr3, unsigned short *result, int len)
{
    const uint16x8_t full_scale = vdupq_n_u16(4096);
    int groups;
    int tail;

    if (len <= 0)
    {
        return 0;
    }

    groups = len >> 3; /* number of full 8-element groups */
    tail = len & 7;    /* elements left after the groups */

    for (; groups > 0; groups--) /* 8 elements per iteration */
    {
        uint16x8_t a = vld1q_u16(arr1);
        uint16x8_t b = vld1q_u16(arr2);
        uint16x8_t w = vld1q_u16(arr3);
        uint16x8_t inv_w = vsubq_u16(full_scale, w); /* 4096 - w */

        /* Widening multiply: 16 x 16 -> 32 bits, low and high halves. */
        uint32x4_t acc_lo = vmull_u16(vget_low_u16(a), vget_low_u16(w));
        uint32x4_t acc_hi = vmull_u16(vget_high_u16(a), vget_high_u16(w));

        /* acc += b * (4096 - w), still in 32 bits. */
        acc_lo = vmlal_u16(acc_lo, vget_low_u16(b), vget_low_u16(inv_w));
        acc_hi = vmlal_u16(acc_hi, vget_high_u16(b), vget_high_u16(inv_w));

        /* >> 12 and narrow back to 16 bits, then store all 8 results. */
        vst1q_u16(result, vcombine_u16(vshrn_n_u32(acc_lo, 12), vshrn_n_u32(acc_hi, 12)));

        arr1 += 8;
        arr2 += 8;
        arr3 += 8;
        result += 8;
    }

    for (; tail > 0; tail--) /* scalar remainder, same arithmetic */
    {
        uint32_t w = *arr3++;
        uint32_t inv_w = (uint16_t)(4096u - w);
        uint32_t acc = w * (uint32_t)*arr1++ + inv_w * (uint32_t)*arr2++;

        *result++ = (unsigned short)(acc >> 12);
    }

    return 0;
}
#define data_len 10240000   /* for comparison: 1024*768 = 786432 */

/*
 * Benchmark for the weighted-blend kernel: fills three input arrays of
 * data_len shorts with pseudo-random values (in_data1 and in_data3 in
 * 0..127, in_data2 in 0..3), then times algtest_c() and algtest_neon() on
 * the same inputs with gettimeofday() and prints the elapsed time of each
 * in microseconds. The output buffer is cleared between the two runs.
 *
 * Returns 0 on success, -1 if a buffer could not be allocated.
 */
int my_alg_test(void)
{
    struct timeval start, end;
    short *in_data1;
    short *in_data2;
    short *in_data3;
    short *out_data1;
    long used_us;
    int i;

    printf("my_alg_test  ! \n");

    in_data1 = malloc(data_len * sizeof *in_data1);
    in_data2 = malloc(data_len * sizeof *in_data2);
    in_data3 = malloc(data_len * sizeof *in_data3);
    out_data1 = malloc(data_len * sizeof *out_data1);
    if (in_data1 == NULL || in_data2 == NULL || in_data3 == NULL || out_data1 == NULL)
    {
        /* free(NULL) is a no-op, so releasing all four is safe. */
        printf("malloc failed ! \n");
        free(in_data1);
        free(in_data2);
        free(in_data3);
        free(out_data1);
        return -1;
    }

    for (i = 0; i < data_len; i++)
    {
        in_data1[i] = (short)(rand() % 128);
        in_data2[i] = (short)(rand() % 4);
        in_data3[i] = (short)(rand() % 128);
        out_data1[i] = 0;
    }

    printf("-------- algtest_c test -------- \r\n");
    gettimeofday(&start, NULL);
    algtest_c(in_data1, in_data2, in_data3, out_data1, data_len);
    gettimeofday(&end, NULL);
    /* Compute in long and print with %ld so the format matches the type. */
    used_us = 1000000L * (long)(end.tv_sec - start.tv_sec) + (long)(end.tv_usec - start.tv_usec);
    printf("algtest_c use time %ldus\n", used_us);
    printf("\r\n ");

    /* Clear the output so the second run starts from the same state. */
    for (i = 0; i < data_len; i++)
    {
        out_data1[i] = 0;
    }

    printf("-------- algtest_neon test -------- \r\n");
    gettimeofday(&start, NULL);
    algtest_neon((unsigned short *)in_data1, (unsigned short *)in_data2, (unsigned short *)in_data3, (unsigned short *)out_data1, data_len);
    gettimeofday(&end, NULL);
    used_us = 1000000L * (long)(end.tv_sec - start.tv_sec) + (long)(end.tv_usec - start.tv_usec);
    printf("algtest_neon use time %ldus\n", used_us);
    printf("\r\n ");

    free(in_data1);
    free(in_data2);
    free(in_data3);
    free(out_data1);

    return 0;
}

测试结果:

my_alg_test  ! 
-------- algtest_c test -------- 
algtest_c use time 187637us

 -------- algtest_neon test -------- 
algtest_neon use time 155556us

       对于比较复杂的数学运算,与仅依靠 O3 编译器优化的代码相比,数据量越大,手工向量化的优势越明显;数据量较小时,则体现不出优势。同时要特别注意中间结果是否会溢出(例如 16 位乘法的结果超过 65535),防止向量化后出现计算错误。

 类似资料: