以 Android 为例，计算 float 数组的和
标准 C 代码实现
// 标准 C 代码实现
/*
 * Reference implementation: add up the first `size` floats of `data`
 * with a plain scalar loop, accumulating in index order.
 * Returns 0 when `size` is not positive.
 */
static float calc_c(const float* data, int size)
{
    float total = 0.f;
    const float* cursor = data;
    int remaining = size;

    while (remaining > 0) {
        total += *cursor++;
        --remaining;
    }
    return total;
}
运用 ARM NEON 优化
// 运用 ARM NEON 优化
#include <arm_neon.h>
/*
 * Sum the first `size` floats of `data` using ARM NEON intrinsics.
 *
 * Complete groups of four floats are accumulated lane-wise in a
 * float32x4_t; the four lanes are then folded into a scalar and the
 * remaining 0..3 tail elements are added one at a time. Because the
 * additions happen in a different order than in a sequential loop, the
 * result can differ from calc_c() by floating-point rounding.
 *
 * Returns 0 when `size` is not positive. (Previously a negative `size`
 * made the tail loop read memory in front of `data`.)
 */
static float calc_neon(const float* data, int size)
{
    if (size <= 0) {
        return 0.f;
    }

    /* Number of complete 4-float groups handled by the vector loop. */
    const int blocks = size / 4;

    float32x4_t sum_vec = vdupq_n_f32(0);
    for (int i = 0; i < blocks; ++i) {
        float32x4_t tmp_vec = vld1q_f32(data + 4 * i);
        sum_vec = vaddq_f32(sum_vec, tmp_vec);
    }

    /* Horizontal reduction of the four partial sums. */
    float sum = 0.f;
    sum += vgetq_lane_f32(sum_vec, 0);
    sum += vgetq_lane_f32(sum_vec, 1);
    sum += vgetq_lane_f32(sum_vec, 2);
    sum += vgetq_lane_f32(sum_vec, 3);

    /* Scalar tail: the elements that did not fill a whole vector. */
    for (int i = 4 * blocks; i < size; ++i) {
        sum += data[i];
    }
    return sum;
}
运用 Ne10 优化
// 运用 Ne10 优化
#include <NE10.h>
/* Number of floats passed to each ne10_add_float_neon() call. */
#define ALIGH_UNIT 4

/*
 * Sum the first `size` floats of `data` using the Ne10 library.
 *
 * Complete groups of ALIGH_UNIT floats are added element-wise into a small
 * accumulator array via ne10_add_float_neon(); the accumulator is then
 * folded into a scalar and the remaining tail elements are added one at a
 * time.
 *
 * Returns 0 when `size` is not positive. (Previously a negative `size`
 * made the tail loop read memory in front of `data`.)
 */
static float calc_ne10(const float* data, int size)
{
    if (size <= 0) {
        return 0.f;
    }

    /* Number of complete ALIGH_UNIT-sized groups handled through Ne10. */
    const int blocks = size / ALIGH_UNIT;

    float sum_vec[ALIGH_UNIT] = {0};
    for (int i = 0; i < blocks; ++i) {
        /*
         * sum_vec is both destination and first source (in-place add).
         * The cast drops const from `data` for the call.
         * NOTE(review): presumably Ne10 declares its sources non-const
         * and does not write through them - confirm against NE10.h.
         */
        ne10_add_float_neon(sum_vec, sum_vec, (float*)data + ALIGH_UNIT * i, ALIGH_UNIT);
    }

    /* Fold the per-slot partial sums into a single scalar. */
    float sum = 0.f;
    for (int i = 0; i < ALIGH_UNIT; ++i) {
        sum += sum_vec[i];
    }

    /*
     * Scalar tail. Starting from blocks * ALIGH_UNIT (rather than masking
     * with ALIGH_UNIT - 1) stays correct even if ALIGH_UNIT is not a
     * power of two.
     */
    for (int i = blocks * ALIGH_UNIT; i < size; ++i) {
        sum += data[i];
    }
    return sum;
}
主程序 Main
// 主程序 Main
#include <stdlib.h>
#include <sys/time.h>
#include <time.h>
#include <android/log.h>
/* Tag attached to every log line emitted through LOGD. */
#define LOG_TAG "Neon/Pref"
/* printf-style debug logging via __android_log_print. */
#define LOGD(...) __android_log_print(ANDROID_LOG_DEBUG,LOG_TAG,__VA_ARGS__)
/* Number of floats in the benchmark input array. */
#define ARRAY_SIZE 5000
/* Declare a local `start<a>` timeval and fill it with the current time. */
#define ELAPSE_BEGIN(a) struct timeval start##a = {0}; gettimeofday(&start##a, 0);
/* Declare a local `end<a>` timeval and fill it with the current time. */
#define ELAPSE_END(a) struct timeval end##a = {0}; gettimeofday(&end##a, 0);
/* Microseconds between the matching ELAPSE_BEGIN(a) and ELAPSE_END(a). */
#define ELAPSE_COUNT(a) (1000000 * (end##a.tv_sec - start##a.tv_sec) + (end##a.tv_usec - start##a.tv_usec))
/*
 * Call fn(args...) once, time the call, and log "<fn> : <microseconds>,
 * Result: <value>". The body is wrapped in do { ... } while (0) so the
 * macro behaves as a single statement (safe in unbraced if/else) and its
 * locals stay scoped to one invocation; callers terminate it with ';'.
 */
#define DO_ELAPSE(fn,...) \
do {\
ELAPSE_BEGIN(_##fn);\
float sum = fn(__VA_ARGS__);\
ELAPSE_END(_##fn);\
LOGD( #fn " : %d, Result: %f", (int)ELAPSE_COUNT(_##fn), sum);\
} while (0)
/*
 * Benchmark driver: fills an array of ARRAY_SIZE floats with values
 * rand() % 5, then runs and logs each summation variant once through
 * DO_ELAPSE.
 */
int main(int argc, char** argv) {
    float data[ARRAY_SIZE] = {0};
    int idx = 0;

    /* Command-line arguments are not used. */
    (void)argc;
    (void)argv;

    while (idx < ARRAY_SIZE) {
        data[idx] = rand() % 5;
        ++idx;
    }

    DO_ELAPSE(calc_c, data, ARRAY_SIZE);
    DO_ELAPSE(calc_neon, data, ARRAY_SIZE);
    DO_ELAPSE(calc_ne10, data, ARRAY_SIZE);

    return 0;
}
[资料文档]
ARM NEON: http://gcc.gnu.org/onlinedocs/gcc/ARM-NEON-Intrinsics.html
NE10 Manual: http://blogs.arm.com/software-enablement/874-ne10-library-getting-started/
[参考资料]
http://hilbert-space.de/?p=22
http://www.crickettechnology.com/blog/?p=691