1、NE10的编译工具是cmake,需要安装cmake
(1)在官网 https://cmake.org/download/ 下载最新版安装包
(2)解压压缩包,并进行安装
./bootstrap
make
make install
(3)安装完成后,可以运行cmake -version确认是否安装成功
(4) 另一种安装方式是直接运行apt-get install cmake进行安装
2、下载Ne10然后解压
(1)如果编译器是通过apt-get install gcc-arm-linux-gnueabihf下载,则可以直接编译
(2)如果编译器是自己安装的,例如安装的是zynq的交叉编译器arm-linux-gnueabi-gcc,则需要修改GNUlinux_config.cmake文件中的编译器名字:将arm-linux-gnueabihf-gcc、arm-linux-gnueabihf-g++、arm-linux-gnueabihf-as分别改为arm-linux-gnueabi-gcc、arm-linux-gnueabi-g++、arm-linux-gnueabi-as。另外,我的编译器安装在非默认路径下,所以一定要设置环境变量:在/etc/bash.bashrc中添加如下两行,并运行source /etc/bash.bashrc使其生效
export CROSS_COMPILE=arm-linux-gnueabi-
export PATH=/home/gcc-linaro-4.9-2016.02-x86_64_arm-linux-gnueabi/bin:$PATH
3、编译
(1)执行
cd $NE10_PATH
mkdir build && cd build
export NE10_LINUX_TARGET_ARCH=armv7 # Can also be "aarch64"
cmake -DCMAKE_TOOLCHAIN_FILE=../GNUlinux_config.cmake ..
(2) 确保以下信息的正确性
root@ubuntu:/home/test/neon/Ne10-master/build# cmake -DCMAKE_TOOLCHAIN_FILE=../GNUlinux_config.cmake ..
-- The C compiler identification is GNU 4.9.4
-- The CXX compiler identification is GNU 4.9.4
-- The ASM compiler identification is GNU
-- Found assembler: /home/gcc-linaro-4.9-2016.02-x86_64_arm-linux-gnueabi/bin/arm-linux-gnueabi-as
-- Check for working C compiler: /home/gcc-linaro-4.9-2016.02-x86_64_arm-linux-gnueabi/bin/arm-linux-gnueabi-gcc
//注意:一定要确认此路径为交叉编译器的位置。最初我没有执行source /etc/bash.bashrc,cmake检测到的一直是主机编译器(-- Check for working C compiler: /usr/bin/gcc),导致编译时一直出现fatal error: arm_neon.h: No such file or directory
-- Check for working C compiler: /home/gcc-linaro-4.9-2016.02-x86_64_arm-linux-gnueabi/bin/arm-linux-gnueabi-gcc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detecting C compile features
-- Detecting C compile features - done
-- Check for working CXX compiler: /home/gcc-linaro-4.9-2016.02-x86_64_arm-linux-gnueabi/bin/arm-linux-gnueabi-g++
-- Check for working CXX compiler: /home/gcc-linaro-4.9-2016.02-x86_64_arm-linux-gnueabi/bin/arm-linux-gnueabi-g++ -- works
-- Detecting CXX compiler ABI info
-- Detecting CXX compiler ABI info - done
-- Detecting CXX compile features
-- Detecting CXX compile features - done
-- Target architecture: armv7
-- Building type: RELEASE
-- Configuring done
-- Generating done
-- Build files have been written to: /home/test/neon/Ne10-master/build
(3) 编译 make,成功后/build/modules/下出现libNE10.a,/build/samples/出现NE10_test_static
(4)如果出现如下问题:
1)gnu/stubs.h:7:29: fatal error: gnu/stubs-soft.h: No such file or directory
需要更改编译选项:将GNULINUX_PLATFORM下的set(CMAKE_C_FLAGS)中的-mfloat-abi=softfp改为-mfloat-abi=hard
(5)开发板上执行NE10_test_static ,出现错误,未解决
./NE10_test_static: error while loading shared libraries: libstdc++.so.6: cannot open shared object file: No such file or directory
4、简单应用
(1)如下是一个简单的数组元素相加例子,sum_array为原始代码,sum_array_neon为用neon的代码,对比了10240000点数组元素相加在不同算法下运行时间
#include <stdio.h>
#include <arm_neon.h>
#include <sys/time.h>
/*
 * Scalar reference implementation: returns the sum of the first `len`
 * elements of `arr`.
 *
 * arr: pointer to the input array; must not be NULL.
 * len: number of elements to add; must be >= 1.
 *
 * Returns the accumulated sum, or 0 after printing "input error" when
 * `arr` is NULL or `len` is smaller than 1.
 */
float sum_array(float *arr, int len)
{
    float sum = 0.0f; /* must start at zero: an uninitialised accumulator is UB */
    int i;

    if (NULL == arr || len < 1) {
        printf("input error\n");
        return 0;
    }

    for (i = 0; i < len; ++i) {
        sum += arr[i];
    }
    return sum;
}
/*
 * NEON implementation: returns the sum of the first `len` elements of `arr`.
 *
 * The array is consumed four floats at a time into a 4-lane vector
 * accumulator; the four lanes are then added together and any remaining
 * (len % 4) elements are added one by one.
 *
 * Returns 0 after printing "input error" when `arr` is NULL or `len` < 1.
 */
float sum_array_neon(float *arr, int len)
{
    float32x4_t acc;
    float total;
    int blocks;
    int tail;
    int k;

    if (arr == NULL || len < 1) {
        printf("input error\n");
        return 0;
    }

    blocks = len / 4; /* number of complete groups of four elements */
    tail = len % 4;   /* elements left over after the last full group */

    /* Vector accumulator, all four lanes start at zero. */
    acc = vdupq_n_f32(0.0);
    for (k = 0; k < blocks; ++k) {
        /* Load four consecutive floats and add them lane-wise. */
        acc = vaddq_f32(acc, vld1q_f32(arr));
        arr += 4;
    }

    /* Horizontal reduction: fold the four lanes into one scalar, lane 0 first. */
    total = vgetq_lane_f32(acc, 0);
    total += vgetq_lane_f32(acc, 1);
    total += vgetq_lane_f32(acc, 2);
    total += vgetq_lane_f32(acc, 3);

    /* Fewer than four elements remain; add them one at a time. */
    while (tail > 0) {
        total += *arr;
        ++arr;
        --tail;
    }
    return total;
}
/* Number of elements in the benchmark array. */
#define size 10240000
/* Benchmark input; file scope so the ~40 MB buffer is not placed on the stack. */
float test[size];

/* Returns the time from `from` to `to` in microseconds. */
static long elapsed_us(const struct timeval *from, const struct timeval *to)
{
    return 1000000L * (long)(to->tv_sec - from->tv_sec)
           + (long)(to->tv_usec - from->tv_usec);
}

/*
 * Benchmark driver: fills `test` with ones, then times sum_array_neon()
 * and sum_array() over the whole array with gettimeofday() and prints
 * each elapsed time in microseconds.
 */
int main(void)
{
    struct timeval start, end;
    int i;
    /* volatile sink: the results are never read, so without it an
     * optimising compiler could discard the calls being timed. */
    volatile float sum_result;

    for (i = 0; i < size; i++) {
        test[i] = 1;
    }

    gettimeofday(&start, NULL);
    sum_result = sum_array_neon(test, size);
    gettimeofday(&end, NULL);
    /* %ld matches the long returned by elapsed_us(). */
    printf("neon use time %ldus\n", elapsed_us(&start, &end));

    gettimeofday(&start, NULL);
    sum_result = sum_array(test, size);
    gettimeofday(&end, NULL);
    printf("no neon use time %ldus\n", elapsed_us(&start, &end));

    return 0;
}
(2)编译。注意:本例只使用了arm_neon.h中的NEON内联函数(intrinsics),并没有调用NE10库中的函数,因此下面命令中的-L. -lNE10对本例并不是必需的,这里保留只是为了演示如何链接上述编译出来的libNE10.a
arm-linux-gnueabi-gcc neon_test.c -mfpu=neon -L. -lNE10
(3)放到开发板中运行,可以看到neon的执行速度更快
zynq> ./a.out
neon use time 254909us
no neon use time 280977us