问题：

Intel Xeon Phi上的快速popcount

从开济

2023-03-14

我正在Intel Xeon®Phi®上实现一个超高速popcount，因为它是各种生物信息学软件的性能热点。

我已经实现了五段代码，

#if defined(__MIC__)
#include <zmmintrin.h>
__attribute__((align(64))) static const uint32_t POPCOUNT_4bit[16] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
__attribute__((align(64))) static const uint32_t MASK_4bit[16] = {0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF, 0xF};
inline uint64_t vpu_popcount1(uint64_t* buf, size_t n)  {
    register size_t result = 0;
    size_t i;
    register const __m512i popcnt = _mm512_load_epi32((void*)POPCOUNT_4bit);
    register const __m512i mask = _mm512_load_epi32((void*)MASK_4bit);
    register __m512i total;
    register __m512i shuf;

#pragma unroll(8)
    for (i = 0; i < n; i+=8) {
        shuf = _mm512_load_epi32(&buf[i]);
        _mm_prefetch((const char *)&buf[i+256], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
        total = _mm512_setzero_epi32();

        total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(shuf, mask), popcnt), total);
        total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 4),  mask), popcnt), total);
        total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 8),  mask), popcnt), total);
        total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 12), mask), popcnt), total);
        total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 16), mask), popcnt), total);
        total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 20), mask), popcnt), total);
        total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 24), mask), popcnt), total);
        total = _mm512_add_epi32(_mm512_permutevar_epi32(_mm512_and_epi32(_mm512_srli_epi32(shuf, 28), mask), popcnt), total);

        /* Reduce add, which is analogous to SSSE3's PSADBW instruction,
           is not implementated as a single instruction in VPUv1, thus
           emulated by multiple instructions*/
        result += _mm512_reduce_add_epi32(total);
    }

    return result;
}

__attribute__((align(64))) static const unsigned magic[] = {\
        0x55555555, 0x55555555, 0x55555555, 0x55555555,\
        0x55555555, 0x55555555, 0x55555555, 0x55555555,\
        0x55555555, 0x55555555, 0x55555555, 0x55555555,\
        0x55555555, 0x55555555, 0x55555555, 0x55555555,\
        0x33333333, 0x33333333, 0x33333333, 0x33333333,\
        0x33333333, 0x33333333, 0x33333333, 0x33333333,\
        0x33333333, 0x33333333, 0x33333333, 0x33333333,\
        0x33333333, 0x33333333, 0x33333333, 0x33333333,\
        0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,\
        0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,\
        0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,\
        0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F,\
        0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,\
        0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,\
        0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,\
        0x00FF00FF, 0x00FF00FF, 0x00FF00FF, 0x00FF00FF,\
        0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,\
        0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,\
        0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,\
        0x0000FFFF, 0x0000FFFF, 0x0000FFFF, 0x0000FFFF,\
            0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,\
            0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,\
            0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF,\
            0x000000FF, 0x000000FF, 0x000000FF, 0x000000FF
    };

inline uint64_t vpu_popcount2(uint64_t* buf, size_t n)  {
    register size_t result = 0;
    size_t i;

    register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
    register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
    register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
    register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
    register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
    register __m512i total;
    register __m512i shuf;

#pragma unroll(8)
    for (i = 0; i < n; i+=8) {
        shuf = _mm512_load_epi32(&buf[i]);
        _mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
        total = _mm512_sub_epi32(shuf, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf,1)));
        total = _mm512_add_epi32(_mm512_and_epi32(B1, total), _mm512_and_epi32(B1,_mm512_srli_epi32(total,2)));
        total = _mm512_and_epi32(B2, _mm512_add_epi32(total, _mm512_srli_epi32(total,4)));
        total = _mm512_and_epi32(B3, _mm512_add_epi32(total, _mm512_srli_epi32(total,8)));
        total = _mm512_and_epi32(B4, _mm512_add_epi32(total, _mm512_srli_epi32(total,16)));

        /* Reduce add, which is analogous to SSSE3's PSADBW instruction,
           is not implementated as a single instruction in VPUv1, thus
           emulated by multiple instructions*/
        result += _mm512_reduce_add_epi32(total);
    }

    return result;
}

inline uint64_t vpu_popcount3(uint64_t* buf, size_t n)  {
    register size_t result = 0;
    size_t i;

    register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
    register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
    register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
    register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
    register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
    register __m512i total;
    register __m512i shuf;

#pragma unroll(4)
    for (i = 0; i < n; i+=16) {
        shuf = _mm512_load_epi32(&buf[i]);
        result += _mm_countbits_64(buf[i+8]);
        _mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+576], _MM_HINT_T1); // vprefetch1
        result += _mm_countbits_64(buf[i+9]);
        _mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
        _mm_prefetch((const char *)&buf[i+128], _MM_HINT_T0); // vprefetch0
        total = _mm512_sub_epi32(shuf, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf,1)));
        result += _mm_countbits_64(buf[i+10]);
        total = _mm512_add_epi32(_mm512_and_epi32(B1, total), _mm512_and_epi32(B1,_mm512_srli_epi32(total,2)));
        result += _mm_countbits_64(buf[i+11]);
        total = _mm512_and_epi32(B2, _mm512_add_epi32(total, _mm512_srli_epi32(total,4)));
        result += _mm_countbits_64(buf[i+12]);
        total = _mm512_and_epi32(B3, _mm512_add_epi32(total, _mm512_srli_epi32(total,8)));
        result += _mm_countbits_64(buf[i+13]);
        total = _mm512_and_epi32(B4, _mm512_add_epi32(total, _mm512_srli_epi32(total,16)));
        result += _mm_countbits_64(buf[i+14]);

        /* Reduce add, which is analogous to SSSE3's PSADBW instruction,
           is not implementated as a single instruction in VPUv1, thus
           emulated by multiple instructions*/
        result += _mm512_reduce_add_epi32(total);
        result += _mm_countbits_64(buf[i+15]);
    }

    return result;
}

/* Using VPU or SSE's machine intrinsic, CPUs not supporting SIMD 
 * will use compiler's implementation, the speed of which depends */
static inline size_t scalar_popcountu(unsigned *buf, size_t n) {
  register size_t cnt = 0;
  size_t i;
#pragma vector always
#pragma unroll(8)
  for (i = 0; i < n; i++) {
    cnt += _mm_countbits_32(buf[i]);
    _mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
  }
  return cnt;
}

static inline size_t scalar_popcountlu(uint64_t *buf, size_t n) {
  register size_t cnt = 0;
  size_t i;
#pragma vector always
#pragma unroll(8)
  for (i = 0; i < n; i++) {
    cnt += _mm_countbits_64(buf[i]);
    _mm_prefetch((const char *)&buf[i+512], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[i+64], _MM_HINT_T0); // vprefetch0
  }
  return cnt;
}
#endif

支持OpenMP的代码摘要可从以下网站下载https://www.dropbox.com/sh/b3sfqps19wa2oi4/iFQ9wQ1NTg

代码是使用“英特尔C/C编译器XE 13”使用以下命令编译的：

icc -debug inline-debug-info -O3 -mmic -fno-alias -ansi-alias -opt-streaming-stores always -ipo popcnt-mmic.cpp -o popcnt-mmic -vec-report=2 -openmp

代码在协处理器（61核）上本机运行，使用导出“122个线程”和“平衡”的线程关联：

export OMP_NUM_THREADS=122;export KMP_AFFINITY=balanced

我正在使用Xeon Phi SE10p、B1 stepping、CentOS6.4在28 MB的垃圾（由rand（）填充）上进行测试，并迭代了10000次，性能如下：

Buffer allocated at: 0x7f456b000000
OpenMP scalar_popcountu       4310169 us; cnt = 28439328
OpenMP scalar_popcountlu      1421139 us; cnt = 28439328
OpenMP vpu_popcount           1489992 us; cnt = 28439328
OpenMP vpu_popcount2          1109530 us; cnt = 28439328
OpenMP vpu_popcount3           951122 us; cnt = 28439328

“scalar\u popcountu”和“scalar\u popcountlu”分别使用“\u mm\u countbits\u 32”和“\u mm\u countbits\u 64”内部函数，它们使用标量“popcnt”指令。设置“#pragma vector always”要求编译器将加载和求和矢量化为16个无符号整数或8个无符号long，尽管popcount本身仍然是一条标量指令。

vpu_popcount1的实现类似于SSSE3 popcount实现http://wm.ite.pl/articles/sse-popcount.html.但是，1）至强Phi不支持整数的打包字节操作（最小值为双字，又名32位）和2）它不实现“打包和绝对差异”指令（如SSSE3中的_mm_sad_epu8），因此缩减添加是由“vpermf32x4”、“vpaddd”和“movslq”四组组合执行的。因此，该实现生成的指令比原始SSSE3版本多得多。

vpu_popcount2的实现类似于SSE2 popcount实现（可以参考“黑客的喜悦”）。该实现生成的指令比vpu_popcount1少，速度快30%左右。然而，繁琐的“减少添加”仍然无法避免。

vpu\u popcount3的实现非常特定于Xeon Phi。使用向量和标量操作的混合，它比vpu\u popcount2快大约15%（在我的实现中，标量操作穿插在向量操作中是很轻松的，可以根据编译器生成的汇编代码重新排列标量操作，但预期的改进就我而言是有限的）。改进基于以下观察：1）Xeon Phi处于顺序调度，2）每个时钟周期可以发出两条标量指令或“1向量1标量”指令。我已经将展开从8减少到4，以避免寄存器文件饱和。

在每个函数中，从内存到L2 8循环和从L2到L1 1循环的显式预取将L1命中率从0.38提高到0.994。

展开确实会将性能提高约15%。这是违反直觉的，因为Xeon Phi是按订单安排的。但unroll使icc编译器能够尽可能多地进行编译时调度。

我们是否有更多的技术来提高性能？

布赖恩·尼克森的两段更快的代码，

OpenMP vpu_popcount2          1110737 us; cnt = 28439328
OpenMP vpu_popcount3           951459 us; cnt = 28439328
OpenMP vpu_popcount3_r         815126 us; cnt = 28439328
OpenMP vpu_popcount5           746852 us; cnt = 28439328

vpu\u popcount3\u修订版：

inline uint64_t vpu_popcount3_revised(uint64_t* buf, size_t n) {
  _mm_prefetch((const char *)&buf[0], _MM_HINT_T0); // vprefetch0
  _mm_prefetch((const char *)&buf[8], _MM_HINT_T0); // vprefetch0
  _mm_prefetch((const char *)&buf[16], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[24], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[32], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[40], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[48], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[56], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[64], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[72], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[80], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[88], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[96], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[104], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[112], _MM_HINT_T1); // vprefetch1
  _mm_prefetch((const char *)&buf[120], _MM_HINT_T1); // vprefetch1
  register size_t result;
  size_t i;

  register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
  register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
  register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
  register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
  register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
  register __m512i total0;
  register __m512i total1;
  register __m512i shuf0;
  register __m512i shuf1;
  register __m512i result0;
  register __m512i result1;

  result0 = _mm512_setzero_epi32();
  result1 = _mm512_setzero_epi32();

  for (i = 0; i < n; i+=16) {
      shuf0 = _mm512_load_epi32(&buf[i  ]);
      shuf1 = _mm512_load_epi32(&buf[i+8]);
      _mm_prefetch((const char *)&buf[i+128], _MM_HINT_T1); // vprefetch1
      _mm_prefetch((const char *)&buf[i+136], _MM_HINT_T1); // vprefetch1
      _mm_prefetch((const char *)&buf[i+16], _MM_HINT_T0); // vprefetch0
      _mm_prefetch((const char *)&buf[i+24], _MM_HINT_T0); // vprefetch0
      total0 = _mm512_sub_epi32(shuf0, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf0,1)));
      total1 = _mm512_sub_epi32(shuf1, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf1,1)));
      total0 = _mm512_add_epi32(_mm512_and_epi32(B1, total0), _mm512_and_epi32(B1,_mm512_srli_epi32(total0,2)));
      total1 = _mm512_add_epi32(_mm512_and_epi32(B1, total1), _mm512_and_epi32(B1,_mm512_srli_epi32(total1,2)));
      total0 = _mm512_and_epi32(B2, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,4)));
      total1 = _mm512_and_epi32(B2, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,4)));
      total0 = _mm512_and_epi32(B3, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,8)));
      total1 = _mm512_and_epi32(B3, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,8)));
      total0 = _mm512_and_epi32(B4, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,16)));
      total1 = _mm512_and_epi32(B4, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,16)));
      result0 = _mm512_add_epi32(result0,total0);
      result1 = _mm512_add_epi32(result1,total1);

  }

  result0 = _mm512_add_epi32(result0,result1);
  result  = _mm512_reduce_add_epi32(result0);

  return result;
}

vpu\u popcount5：

inline uint64_t vpu_popcount5(uint64_t* buf, size_t n)  {
    _mm_prefetch((const char *)&buf[0], _MM_HINT_T0); // vprefetch0
    _mm_prefetch((const char *)&buf[8], _MM_HINT_T0); // vprefetch0
    _mm_prefetch((const char *)&buf[16], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[24], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[32], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[40], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[48], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[56], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[64], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[72], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[80], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[88], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[96], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[104], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[112], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[120], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[128], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[136], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[144], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[152], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[160], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[168], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[176], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[184], _MM_HINT_T1); // vprefetch1
    register size_t result;
    size_t i;

    register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
    register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
    register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
    register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
    register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
    register const __m512i B6 = _mm512_load_epi32((void*)(magic+80));
    register __m512i total0;
    register __m512i total1;
    register __m512i total2;
    register __m512i total3;
    register __m512i shuf0;
    register __m512i shuf1;
    register __m512i shuf2;
    register __m512i shuf3;
    register __m512i result0;
    register __m512i result1;

    result0 = _mm512_setzero_epi32();
    result1 = _mm512_setzero_epi32();

    for (i = 0; i < n; i+=32) {
            shuf0 = _mm512_load_epi32(&buf[i   ]);
            shuf1 = _mm512_load_epi32(&buf[i+ 8]);
            shuf2 = _mm512_load_epi32(&buf[i+16]);
            shuf3 = _mm512_load_epi32(&buf[i+24]);
            _mm_prefetch((const char *)&buf[i+192], _MM_HINT_T1); // vprefetch1
            _mm_prefetch((const char *)&buf[i+200], _MM_HINT_T1); // vprefetch1
            _mm_prefetch((const char *)&buf[i+208], _MM_HINT_T1); // vprefetch1
            _mm_prefetch((const char *)&buf[i+216], _MM_HINT_T1); // vprefetch1
            _mm_prefetch((const char *)&buf[i+32], _MM_HINT_T0); // vprefetch0
            _mm_prefetch((const char *)&buf[i+40], _MM_HINT_T0); // vprefetch0
            _mm_prefetch((const char *)&buf[i+48], _MM_HINT_T0); // vprefetch0
            _mm_prefetch((const char *)&buf[i+56], _MM_HINT_T0); // vprefetch0
            total0 = _mm512_sub_epi32(shuf0, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf0,1)));                        //  max value in nn is 10
            total1 = _mm512_sub_epi32(shuf1, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf1,1)));
            total2 = _mm512_sub_epi32(shuf2, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf2,1)));
            total3 = _mm512_sub_epi32(shuf3, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf3,1)));
            total0 = _mm512_add_epi32(_mm512_and_epi32(B1, total0), _mm512_and_epi32(B1,_mm512_srli_epi32(total0,2))); //  max value in nnnn is 0100
            total1 = _mm512_add_epi32(_mm512_and_epi32(B1, total1), _mm512_and_epi32(B1,_mm512_srli_epi32(total1,2)));
            total2 = _mm512_add_epi32(_mm512_and_epi32(B1, total2), _mm512_and_epi32(B1,_mm512_srli_epi32(total2,2)));
            total3 = _mm512_add_epi32(_mm512_and_epi32(B1, total3), _mm512_and_epi32(B1,_mm512_srli_epi32(total3,2)));
            total0 = _mm512_and_epi32(B2, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,4)));                      //  max value in 0000nnnn is 00001000
            total1 = _mm512_and_epi32(B2, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,4)));
            total2 = _mm512_and_epi32(B2, _mm512_add_epi32(total2, _mm512_srli_epi32(total2,4)));
            total3 = _mm512_and_epi32(B2, _mm512_add_epi32(total3, _mm512_srli_epi32(total3,4)));
            total0 = _mm512_add_epi32(total0, total1);                                                                 //  max value in 000nnnnn is 00010000
            total1 = _mm512_add_epi32(total2, total3);
            total0 = _mm512_add_epi32(total0, _mm512_srli_epi32(total0,8));                                            //  max value in xxxxxxxx00nnnnnn is 00100000
            total1 = _mm512_add_epi32(total1, _mm512_srli_epi32(total1,8));
            total0 = _mm512_and_epi32(B6, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,16)));                     //  max value in each element is 01000000, i.e. 64
            total1 = _mm512_and_epi32(B6, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,16)));
            result0 = _mm512_add_epi32(result0,total0);
            result1 = _mm512_add_epi32(result1,total1);
    }

    result0 = _mm512_add_epi32(result0,result1);
    result  = _mm512_reduce_add_epi32(result0);

    return result;
}

共有2个答案

石博艺

2023-03-14

请尝试以下变体，并报告这是否提高了您的性能？我正在解决我认为在您的编码中不太理想的几个问题：

我认为您的预取距离不太正确。在我看来，当索引实际上是uint64时，您可能一直在考虑字节偏移距离。
我认为没有理由在循环的每次迭代中都进行缩减操作。您可以对16个SIMD元素中的位计数进行部分累加，然后在循环外进行单个减少
我认为，执行标量端popcount指令不如真正充分利用VPU调度优势。专注于出色的VPU计划是最重要的。我也不认为标量popcount指令实际上与向量操作配对；i、 e.我认为它仅在U形管中受支撑

inline uint64_t vpu_popcount3_revised(uint64_t* buf, size_t n) {
    _mm_prefetch((const char *)&buf[0], _MM_HINT_T0); // vprefetch0
    _mm_prefetch((const char *)&buf[8], _MM_HINT_T0); // vprefetch0
    _mm_prefetch((const char *)&buf[16], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[24], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[32], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[40], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[48], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[56], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[64], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[72], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[80], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[88], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[96], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[104], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[112], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[120], _MM_HINT_T1); // vprefetch1
    register size_t result;
    size_t i;

    register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
    register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
    register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
    register const __m512i B3 = _mm512_load_epi32((void*)(magic+48));
    register const __m512i B4 = _mm512_load_epi32((void*)(magic+64));
    register __m512i total0;
    register __m512i total1;
    register __m512i shuf0;
    register __m512i shuf1;
    register __m512i result0;
    register __m512i result1;

    result0 = _mm512_setzero_epi32();
    result1 = _mm512_setzero_epi32();

    for (i = 0; i < n; i+=16) {
        shuf0 = _mm512_load_epi32(&buf[i  ]);
        shuf1 = _mm512_load_epi32(&buf[i+8]);
        _mm_prefetch((const char *)&buf[i+128], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+136], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+16], _MM_HINT_T0); // vprefetch0
        _mm_prefetch((const char *)&buf[i+24], _MM_HINT_T0); // vprefetch0
        total0 = _mm512_sub_epi32(shuf0, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf0,1)));
        total1 = _mm512_sub_epi32(shuf1, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf1,1)));
        total0 = _mm512_add_epi32(_mm512_and_epi32(B1, total0), _mm512_and_epi32(B1,_mm512_srli_epi32(total0,2)));
        total1 = _mm512_add_epi32(_mm512_and_epi32(B1, total1), _mm512_and_epi32(B1,_mm512_srli_epi32(total1,2)));
        total0 = _mm512_and_epi32(B2, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,4)));
        total1 = _mm512_and_epi32(B2, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,4)));
        total0 = _mm512_and_epi32(B3, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,8)));
        total1 = _mm512_and_epi32(B3, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,8)));
        total0 = _mm512_and_epi32(B4, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,16)));
        total1 = _mm512_and_epi32(B4, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,16)));
        result0 = _mm512_add_epi32(result0,total0);
        result1 = _mm512_add_epi32(result1,total1);

    }

    /* Reduce add, which is analogous to SSSE3's PSADBW instruction,
       is not implementated as a single instruction in VPUv1, thus
       emulated by multiple instructions*/

    result0 = _mm512_add_epi32(result0,result1);
    result  = _mm512_reduce_add_epi32(result0);

    return result;
}

宋唯

2023-03-14

自从昨天发帖以来，我已经能够在我自己的卡片上运行您的代码和我的建议。我没有得到与您完全相同的计时，可能是由于硬件的步进，也可能与编译器的版本有关。但这种趋势仍然存在，我的建议似乎实现了大约15%的性能提升。

我得到了一个额外的小的性能提升，在5%到10%之间，并进行了一些调整，如下面的代码所示。请注意，在下面的代码段中，B6将每个元素设置为0x000000FF。在这一点上，我认为该算法可能非常接近GDDR向二级缓存提供的最大可持续带宽。

（补充说明：这一断言的一个证明是，如果我用一个重复十次的for循环来包装popcount5函数体，请注意这是“chunk\u size”的十次快速重复因此，在L2中，有九次测试会非常热——测试的总时间只会增加大约五倍，而不是十倍。我之所以提出这个问题，是因为我认为您的目标是调整位计数逻辑的速度，但可能您希望在其中部署它的应用程序实际上有一个更小和/或更热的工作集。如果是这样，DRAM引入的节流--

inline uint64_t vpu_popcount5(uint64_t* buf, size_t n)  {
    _mm_prefetch((const char *)&buf[0], _MM_HINT_T0); // vprefetch0
    _mm_prefetch((const char *)&buf[8], _MM_HINT_T0); // vprefetch0
    _mm_prefetch((const char *)&buf[16], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[24], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[32], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[40], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[48], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[56], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[64], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[72], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[80], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[88], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[96], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[104], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[112], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[120], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[128], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[136], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[144], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[152], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[160], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[168], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[176], _MM_HINT_T1); // vprefetch1
    _mm_prefetch((const char *)&buf[184], _MM_HINT_T1); // vprefetch1
    register size_t result;
    size_t i;

    register const __m512i B0 = _mm512_load_epi32((void*)(magic+0));
    register const __m512i B1 = _mm512_load_epi32((void*)(magic+16));
    register const __m512i B2 = _mm512_load_epi32((void*)(magic+32));
    register const __m512i B6 = _mm512_load_epi32((void*)(magic+80));
    register __m512i total0;
    register __m512i total1;
    register __m512i total2;
    register __m512i total3;
    register __m512i shuf0;
    register __m512i shuf1;
    register __m512i shuf2;
    register __m512i shuf3;
    register __m512i result0;
    register __m512i result1;

    result0 = _mm512_setzero_epi32();
    result1 = _mm512_setzero_epi32();

    for (i = 0; i < n; i+=32) {
        shuf0 = _mm512_load_epi32(&buf[i   ]);
        shuf1 = _mm512_load_epi32(&buf[i+ 8]);
        shuf2 = _mm512_load_epi32(&buf[i+16]);
        shuf3 = _mm512_load_epi32(&buf[i+24]);
        _mm_prefetch((const char *)&buf[i+192], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+200], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+208], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+216], _MM_HINT_T1); // vprefetch1
        _mm_prefetch((const char *)&buf[i+32], _MM_HINT_T0); // vprefetch0
        _mm_prefetch((const char *)&buf[i+40], _MM_HINT_T0); // vprefetch0
        _mm_prefetch((const char *)&buf[i+48], _MM_HINT_T0); // vprefetch0
        _mm_prefetch((const char *)&buf[i+56], _MM_HINT_T0); // vprefetch0
        total0 = _mm512_sub_epi32(shuf0, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf0,1)));                        //  max value in nn is 10
        total1 = _mm512_sub_epi32(shuf1, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf1,1)));
        total2 = _mm512_sub_epi32(shuf2, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf2,1)));
        total3 = _mm512_sub_epi32(shuf3, _mm512_and_epi32(B0, _mm512_srli_epi32(shuf3,1)));
        total0 = _mm512_add_epi32(_mm512_and_epi32(B1, total0), _mm512_and_epi32(B1,_mm512_srli_epi32(total0,2))); //  max value in nnnn is 0100
        total1 = _mm512_add_epi32(_mm512_and_epi32(B1, total1), _mm512_and_epi32(B1,_mm512_srli_epi32(total1,2)));
        total2 = _mm512_add_epi32(_mm512_and_epi32(B1, total2), _mm512_and_epi32(B1,_mm512_srli_epi32(total2,2)));
        total3 = _mm512_add_epi32(_mm512_and_epi32(B1, total3), _mm512_and_epi32(B1,_mm512_srli_epi32(total3,2)));
        total0 = _mm512_and_epi32(B2, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,4)));                      //  max value in 0000nnnn is 00001000
        total1 = _mm512_and_epi32(B2, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,4)));
        total2 = _mm512_and_epi32(B2, _mm512_add_epi32(total2, _mm512_srli_epi32(total2,4)));
        total3 = _mm512_and_epi32(B2, _mm512_add_epi32(total3, _mm512_srli_epi32(total3,4)));
        total0 = _mm512_add_epi32(total0, total1);                                                                 //  max value in 000nnnnn is 00010000
        total1 = _mm512_add_epi32(total2, total3);
        total0 = _mm512_add_epi32(total0, _mm512_srli_epi32(total0,8));                                            //  max value in xxxxxxxx00nnnnnn is 00100000
        total1 = _mm512_add_epi32(total1, _mm512_srli_epi32(total1,8));
        total0 = _mm512_and_epi32(B6, _mm512_add_epi32(total0, _mm512_srli_epi32(total0,16)));                     //  max value in each element is 01000000, i.e. 64
        total1 = _mm512_and_epi32(B6, _mm512_add_epi32(total1, _mm512_srli_epi32(total1,16)));
        result0 = _mm512_add_epi32(result0,total0);
        result1 = _mm512_add_epi32(result1,total1);

        /* Reduce add, which is analogous to SSSE3's PSADBW instruction,
           is not implementated as a single instruction in VPUv1, thus
           emulated by multiple instructions*/
    }

    result0 = _mm512_add_epi32(result0,result1);
    result  = _mm512_reduce_add_epi32(result0);

    return result;
}

类似资料：

快速上手

Hello World 如何调试如何新增一个Controller 如何使用models/dao 如何使用models/service
快速上手

介绍通过本章节你可以了解到 Vant 的安装方法和基本使用姿势。安装通过 npm 安装在现有项目中使用 Vant 时，可以通过 npm 或 yarn 进行安装： # Vue 2 项目，安装 Vant 2.x 版本： npm i vant -S # Vue 3 项目，安装 Vant 3.x 版本： npm i vant@next -S 通过 CDN 安装使用 Vant 最简单的方法是直
快速上手

安装 npm i feart --save-dev # or yarn add feart 引入组件 No.1 : 使用 babel-plugin-import (推荐) # 安装 babel-plugin-import 插件 npm i babel-plugin-import --save-dev // 在 .babelrc 或 babel.config.js 中添加插件配置 {
快速上手

本节课程提供一个使用 Spark 的快速介绍，首先我们使用 Spark 的交互式 shell(用 Python 或 Scala) 介绍它的 API。当演示如何在 Java, Scala 和 Python 写独立的程序时，看编程指南里完整的参考。依照这个指南，首先从 Spark 网站下载一个 Spark 发行包。因为我们不会使用 HDFS，你可以下载任何 Hadoop 版本的包。 Spark Sh
快速上手

快速上手本节将介绍如何在项目中使用 Element。使用 vue-cli@3 我们为新版的 vue-cli 准备了相应的 Element 插件，你可以用它们快速地搭建一个基于 Element 的项目。使用 Starter Kit 我们提供了通用的项目模板，你可以直接使用。对于 Laravel 用户，我们也准备了相应的模板，同样可以直接下载使用。如果不希望使用我们提供的模板，请继续阅读。引
快速上手

新手教程项目：cube-application-guide 遇到问题，先移步 QA 使用 nuxt，请参考示例仓库脚手架 vue-cli >= 3 如果你正在使用新版本的 Vue CLI vue-cli@3，那么推荐你直接使用 vue-cli-plugin-cube-ui 插件。在你初始化完项目后直接执行 vue add cube-ui 即可。在执行的时候，会询问一些配置项，这个和老版本的模
快速上手

通过 npm 安装 dva-cli 并确保版本是 0.9.1 或以上。 $ npm install dva-cli -g $ dva -v dva-cli version 0.9.1 创建新应用安装完 dva-cli 之后，就可以在命令行里访问到 dva 命令（不能访问？）。现在，你可以通过 dva new 创建新应用。 $ dva new dva-quickstart 这会创建 dva-q
快速上手

介绍如何快速搭建多云环境，管理多云资源。文档版本本文档对应产品版本：云联壹云 3.8 版本。读者对象本文档用于帮助用户快速上手云联壹云平台，了解如何快速管理多云环境、实现多云资源的管理等。本文档主要适用于以下读者：云联壹云平台用户部署运维工程师技术支持工程师对云联壹云有研究兴趣的相关人员平台总览该章节用于介绍平台的主要功能。快速配置该章节用于帮助用户快速配置

Intel Xeon Phi上的快速popcount

共有2个答案

相关问答

相关文章

相关阅读

相关工具

相关文档