问题：

AVX 256位代码的性能略差于等效的128位SSSE3代码

常明亮

2023-03-14

我正在尝试编写非常高效的汉明距离代码。受到Wojciech Muła极其巧妙的SSSE3 popcount实现的启发，我编写了一个等效的AVX2版本，这次使用了256位寄存器。由于所涉及操作的并行度翻倍，我期望至少有30%-40%的性能提升，然而令我惊讶的是，AVX2代码反而慢了一点点（大约2%）！

两个64字节块的汉明距离，展开的SSSE3版本：

// Hamming distance between two blocks of 16 UINT32 values, using 128-bit
// registers and a 4-bit lookup table applied with _mm_shuffle_epi8.
//
// pA, pB : input blocks; elements [0, 16) of each are read with unaligned
//          loads, four elements per load.
// returns: number of bit positions in which the two blocks differ.
//
// Relies on the file-level variables `lookup` (popcount of each nibble value
// 0..15) and `low_mask` (0x0f in every byte), which _tmain initialises before
// the first call. The loop over the four 128-bit chunks is unrolled by hand.
INT32 SSE_PopCount(const UINT32* __restrict pA, const UINT32* __restrict pB) {

   // Per-byte running bit counts. Each chunk adds at most 4 + 4 per byte, so
   // four chunks stay at or below 32 and cannot overflow an 8-bit lane.
   __m128i paccum  = _mm_setzero_si128();

   // Chunk 0 (elements 0..3): XOR marks the differing bits, then each byte is
   // split into its low and high nibble and both are looked up in the table.
   __m128i a       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA));
   __m128i b       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB));
   __m128i err     = _mm_xor_si128   (a, b);
   __m128i lo      = _mm_and_si128   (err, low_mask);
   // The shift works on 16-bit lanes, so bits cross byte boundaries; the
   // following mask discards everything but the wanted high nibble.
   __m128i hi      = _mm_srli_epi16  (err, 4);
           hi      = _mm_and_si128   (hi, low_mask);
   __m128i popcnt1 = _mm_shuffle_epi8(lookup, lo);
   __m128i popcnt2 = _mm_shuffle_epi8(lookup, hi);
           paccum  = _mm_add_epi8(paccum, popcnt1);
           paccum  = _mm_add_epi8(paccum, popcnt2);

           // Chunk 1 (elements 4..7): same sequence as above.
           a       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA + 4));
           b       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB + 4));
           err     = _mm_xor_si128   (a, b);
           lo      = _mm_and_si128   (err, low_mask);
           hi      = _mm_srli_epi16  (err, 4);
           hi      = _mm_and_si128   (hi, low_mask);
           popcnt1 = _mm_shuffle_epi8(lookup, lo);
           popcnt2 = _mm_shuffle_epi8(lookup, hi);
           paccum  = _mm_add_epi8(paccum, popcnt1);
           paccum  = _mm_add_epi8(paccum, popcnt2);

           // Chunk 2 (elements 8..11).
           a       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA + 8));
           b       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB + 8));
           err     = _mm_xor_si128   (a, b);
           lo      = _mm_and_si128   (err, low_mask);
           hi      = _mm_srli_epi16  (err, 4);
           hi      = _mm_and_si128   (hi, low_mask);
           popcnt1 = _mm_shuffle_epi8(lookup, lo);
           popcnt2 = _mm_shuffle_epi8(lookup, hi);
           paccum  = _mm_add_epi8(paccum, popcnt1);
           paccum  = _mm_add_epi8(paccum, popcnt2);

           // Chunk 3 (elements 12..15).
           a       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pA + 12));
           b       = _mm_loadu_si128 (reinterpret_cast<const __m128i*>(pB + 12));
           err     = _mm_xor_si128   (a, b);
           lo      = _mm_and_si128   (err, low_mask);
           hi      = _mm_srli_epi16  (err, 4);
           hi      = _mm_and_si128   (hi, low_mask);
           popcnt1 = _mm_shuffle_epi8(lookup, lo);
           popcnt2 = _mm_shuffle_epi8(lookup, hi);
           paccum  = _mm_add_epi8(paccum, popcnt1);
           paccum  = _mm_add_epi8(paccum, popcnt2);

           // Sum of absolute differences against zero adds up the eight byte
           // counters of each 64-bit half, leaving one partial sum per half.
           paccum  = _mm_sad_epu8(paccum, _mm_setzero_si128());
   // m128i_u64 is the member-style access to the two 64-bit halves (this is
   // how MSVC exposes __m128i; other compilers may not provide it).
   UINT64  result =  paccum.m128i_u64[0] + paccum.m128i_u64[1];
   return (INT32)result;
}

使用AVX 256位寄存器的未展开等效版本：

// Hamming distance between two blocks of 16 UINT32 values, using 256-bit
// registers: the same nibble-lookup scheme as SSE_PopCount, but with two
// 256-bit chunks instead of four 128-bit ones.
//
// pA, pB : input blocks; elements [0, 16) of each are read with unaligned
//          loads, eight elements per load.
// returns: number of bit positions in which the two blocks differ.
//
// Relies on the file-level variables `lookup256` (the 16-entry nibble
// popcount table repeated in both 128-bit halves) and `low_mask256` (0x0f in
// every byte), which _tmain initialises before the first call.
INT32 AVX_PopCount(const UINT32* __restrict pA, const UINT32* __restrict pB) {
   // Per-byte running bit counts. Each chunk adds at most 4 + 4 per byte, so
   // two chunks stay at or below 16 and cannot overflow an 8-bit lane.
   __m256i paccum =  _mm256_setzero_si256();

   // Chunk 0 (elements 0..7): XOR marks the differing bits, then each byte is
   // split into its low and high nibble and both are looked up in the table.
   __m256i a       = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pA));
   __m256i b       = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pB));
   __m256i err     = _mm256_xor_si256   (a, b);
   __m256i lo      = _mm256_and_si256   (err, low_mask256);
   // The shift works on 16-bit lanes, so bits cross byte boundaries; the
   // following mask discards everything but the wanted high nibble.
   __m256i hi      = _mm256_srli_epi16  (err, 4);
           hi      = _mm256_and_si256   (hi, low_mask256);
   // _mm256_shuffle_epi8 looks up within each 128-bit half independently,
   // which is why lookup256 carries the table twice.
   __m256i popcnt1 = _mm256_shuffle_epi8(lookup256, lo);
   __m256i popcnt2 = _mm256_shuffle_epi8(lookup256, hi);
           paccum  = _mm256_add_epi8(paccum, popcnt1);
           paccum  = _mm256_add_epi8(paccum, popcnt2);

           // Chunk 1 (elements 8..15): same sequence as above.
           a       = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pA + 8));
           b       = _mm256_loadu_si256 (reinterpret_cast<const __m256i*>(pB + 8));
           err     = _mm256_xor_si256   (a, b);
           lo      = _mm256_and_si256   (err, low_mask256);
           hi      = _mm256_srli_epi16  (err, 4);
           hi      = _mm256_and_si256   (hi, low_mask256);
           popcnt1 = _mm256_shuffle_epi8(lookup256, lo);
           popcnt2 = _mm256_shuffle_epi8(lookup256, hi);
           paccum  = _mm256_add_epi8(paccum, popcnt1);
           paccum  = _mm256_add_epi8(paccum, popcnt2);

           // Sum of absolute differences against zero adds up the eight byte
           // counters of each 64-bit quarter, leaving four partial sums.
           paccum  = _mm256_sad_epu8(paccum, _mm256_setzero_si256());
           // NOTE(review): this mixes the signed (m256i_i64) and unsigned
           // (m256i_u64) member views of the same register. The partial sums
           // are small and non-negative, so the total is the same either way,
           // but one view throughout would be more consistent. These members
           // are the MSVC-style access to __m256i.
           UINT64  result =  paccum.m256i_i64[0] + paccum.m256i_u64[1] + paccum.m256i_i64[2] + paccum.m256i_i64[3];
   return (INT32)result;
}

我已经检查了编译器生成的汇编代码，看起来没有问题：内联函数（intrinsics）如预期那样被直接翻译成了机器指令。我注意到的唯一一点是，在AVX2版本中，最后一行需要累加4个四字（quad-word）的popcount结果，它生成的代码比SSSE3版本（只需累加2个四字即可得到popcount）更复杂，但我仍然期望它有更高的吞吐量。

四字累加部分生成的AVX2代码：

; Compiler output for the final line of AVX_PopCount: sums the four 64-bit
; partial counts held in ymm2. Only the low 32 bits of each are read (movd).
vextractf128 xmm0, ymm2, 1      ; xmm0 = upper 128 bits of ymm2 (qwords 2 and 3)
psrldq  xmm0, 8                 ; shift right 8 bytes: qword 3 now in the low lane
movd    ecx, xmm2               ; ecx = qword 0
movd    eax, xmm0               ; eax = qword 3
vextractf128 xmm0, ymm2, 1      ; extract the upper half again: qword 2 in the low lane
psrldq  xmm2, 8                 ; shift right 8 bytes: qword 1 now in the low lane
add eax, ecx                    ; eax = qword 3 + qword 0
movd    ecx, xmm0               ; ecx = qword 2
add eax, ecx                    ; eax += qword 2
movd    ecx, xmm2               ; ecx = qword 1
add eax, ecx                    ; eax += qword 1 -> total

四字累加部分生成的SSSE3代码：

; Compiler output for the final line of SSE_PopCount: sums the two 64-bit
; partial counts held in xmm2. Only the low 32 bits of each are read (movd).
movd    ecx, xmm2               ; ecx = qword 0
psrldq  xmm2, 8                 ; shift right 8 bytes: qword 1 now in the low lane
movd    eax, xmm2               ; eax = qword 1
add eax, ecx                    ; eax = qword 1 + qword 0 -> total

我的测试程序对每个例程调用100万次，每次输入值不同，但重用两个静态缓冲区来保存pA和pB参数的数据。以我对CPU架构的有限理解，这种局部性（反复重用相同的内存缓冲区）应该能很好地预热CPU缓存，不会受内存带宽问题的限制；但除了可能的内存带宽因素之外，我想不出为什么没有性能提升。

// Benchmark driver: initialises the lookup tables and masks used by
// SSE_PopCount / AVX_PopCount, then runs each routine 1,000,000 times on
// freshly generated random input and prints the accumulated bit count.
// Timing is reported by a boost::timer::auto_cpu_timer scoped around each
// loop. Note that the timed region includes filling the input arrays with
// random numbers, not just the popcount call.
int _tmain(int argc, _TCHAR* argv[]) {

   // 128-bit table: entry i is the number of set bits in the nibble value i.
   lookup = _mm_setr_epi8(
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
    );
   // Mask that keeps the low nibble of every byte.
   low_mask = _mm_set1_epi8(0xf);

   // 256-bit table: the same 16 entries written out twice, once per 128-bit
   // half of the register.
   lookup256 = _mm256_setr_epi8(
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4,
        /* 0 */ 0, /* 1 */ 1, /* 2 */ 1, /* 3 */ 2,
        /* 4 */ 1, /* 5 */ 2, /* 6 */ 2, /* 7 */ 3,
        /* 8 */ 1, /* 9 */ 2, /* a */ 2, /* b */ 3,
        /* c */ 2, /* d */ 3, /* e */ 3, /* f */ 4
    );

   low_mask256 = _mm256_set1_epi8(0xf);


   // Random source for the AVX run, with a fixed seed so the run is repeatable.
   std::default_random_engine generator;
   generator.seed(37);
   std::uniform_int_distribution<UINT32> distribution(0, ULONG_MAX);
   // std::bind stores copies of the distribution and the engine; each dice()
   // call draws the next value from those copies.
   auto dice = std::bind( distribution, generator);


   // Input buffers, reused for every iteration of both benchmarks.
   UINT32 a[16];
   UINT32 b[16];

   int count;
   count = 0;
   {
      cout << "AVX PopCount\r\n";
      // Scoped timer: covers everything up to the closing brace of this block.
      boost::timer::auto_cpu_timer t;
      for( int i = 0; i < 1000000; i++ ) {
         // 32 random draws per iteration happen inside the timed region.
         for( int j = 0; j < 16; j++ ) {
            a[j] = dice();
            b[j] = dice();
         }
         count+= AVX_PopCount(a, b);
      }
   }

   // Printing the total keeps the calls observable and lets the two runs be
   // compared against each other.
   cout << count << "\r\n";


   // Second engine with the same seed and distribution range, so the SSE run
   // is fed the same input sequence as the AVX run.
   std::default_random_engine generator2;
   generator2.seed(37);
   std::uniform_int_distribution<UINT32> distribution2(0, ULONG_MAX);
   auto dice2 = std::bind( distribution2, generator2);


   count = 0;
   {
      cout << "SSE PopCount\r\n";
      boost::timer::auto_cpu_timer t;
      for( int i = 0; i < 1000000; i++ ) {
         for( int j = 0; j < 16; j++ ) {
            a[j] = dice2();
            b[j] = dice2();
         }
         count+= SSE_PopCount(a, b);
      }
   }
   cout << count << "\r\n";

   // Wait for a key press before exiting, presumably to keep the console open.
   getch();
   return 0;
}

测试机器是Intel Core i7-4790，我使用的是Visual Studio 2012 Pro。

共有1个答案

仰经武

2023-03-14

除了评论中提到的小问题（使用/arch:AVX编译）之外，您的主要问题是在每次迭代时都生成随机输入数组。这才是您的瓶颈，因此您的测试无法有效地评估您的方法。注意——我没有使用boost，但GetTickCount也可用于此目的。请看下面的代码：

// Excerpt from the benchmark: the question's AVX loop, timed with
// GetTickCount instead of boost. `a`, `b`, `dice` and AVX_PopCount come from
// the surrounding program.
int count;
count = 0;
{
    cout << "AVX PopCount\r\n";
    // Tick count at the start of the timed region.
    unsigned int Tick = GetTickCount();
    for (int i = 0; i < 1000000; i++) {
        // Input generation is still inside the timed loop here.
        for (int j = 0; j < 16; j++) {
            a[j] = dice();
            b[j] = dice();
        }
        count += AVX_PopCount(a, b);
    }
    // Elapsed ticks for the whole loop, RNG included.
    Tick = GetTickCount() - Tick;
    cout << Tick << "\r\n";
}

产生输出：

AVX PopCount
2309
256002470

// Control measurement: the same loop with the popcount call removed, so the
// elapsed time is that of filling the arrays with random numbers alone.
// `count` is declared but not used in this variant.
int count;
count = 0;
{
    cout << "Just making arrays...\r\n";
    unsigned int Tick = GetTickCount();
    for (int i = 0; i < 1000000; i++) {
        for (int j = 0; j < 16; j++) {
            a[j] = dice();
            b[j] = dice();
        }           
    }
    Tick = GetTickCount() - Tick;
    cout << Tick << "\r\n";
}

结果如何？这其实并不奇怪，因为每次迭代都要生成32个随机数，这可能相当昂贵，而之后只是执行一些相当快的整数运算和shuffle（洗牌）操作。

所以...

现在让我们把迭代次数再增加100倍，并把随机数生成移出紧密循环。这里在禁用优化的情况下编译，这样代码会按预期执行，不会丢弃“无用的”迭代——毕竟我们关心的代码已经（手动）优化过了！

    // Corrected benchmark: the inputs are generated once, outside any timed
    // region, and both routines then run 100,000,000 times on that same data.
    for (int j = 0; j < 16; j++) {
        a[j] = dice();
        b[j] = dice();
    }

    int count;
    count = 0;
    {
        cout << "AVX PopCount\r\n";
        unsigned int Tick = GetTickCount();
        // Only the popcount call is inside the timed loop now.
        for (int i = 0; i < 100000000; i++) {           
            count += AVX_PopCount(a, b);
        }
        Tick = GetTickCount() - Tick;
        cout << Tick << "\r\n";
    }

    // The totals of the two runs should match, since the inputs are identical.
    cout << count << "\r\n";

    count = 0;
    {
        cout << "SSE PopCount\r\n";
        unsigned int Tick = GetTickCount();
        for (int i = 0; i < 100000000; i++) {
            count += SSE_PopCount(a, b);
        }
        Tick = GetTickCount() - Tick;
        cout << Tick << "\r\n";
    }
    cout << count << "\r\n";

AVX PopCount
3744
730196224
SSE PopCount
5616
730196224

所以恭喜你--你可以拍拍自己的背了，你的AVX例程确实比SSE例程快了大约三分之一（这里在Haswell i7上测试过）。教训是要确保你实际上是在分析你认为你在分析的东西！

类似资料：

JIT编译代码位于何处？

问题内容：所以我有用Java编写的这种方法：并假设我的应用程序多次调用此方法。在Java虚拟机上为该方法运行编译后的代码时，JVM将首先解释该方法。然后经过一段时间，如果我理解正确，它将决定将其编译为机器语言。这一点，会被内存中的机器代码覆盖吗？如果覆盖，大小差异问题将如何解决？如果将其写入内存中的其他位置，加载到内存中的字节码是否会释放？而且，如果字节代码和jit编译代码都在内存中，那
arcgis android之定位功能的示例代码

本文向大家介绍arcgis android之定位功能的示例代码，包括了arcgis android之定位功能的示例代码的使用技巧和注意事项，需要的朋友参考一下关于定位的功能，开发，很早之前就有做过百度的定位功能。起初是有想法把百度的Loc V3.2的定位SDK整合进来用。但是终归是想法，但是知道昨天，我问技术群，里面的一位朋友就说起了百度地位SDK整合进来的实现方法。顿时，我就思考了一会，随后就
Swift 4.2代码等效于SAP的Leonardo API[副本]

使用场景文本识别API连接SAP Leonardo沙箱服务器时出现HTTP 400错误我在响应对象中得到以下详细信息 {url:https://sandbox.api.sap.com/ml/scenetextrecognition/scene-text-recognition/scene-text-recognition}{Status code：400,Headers{\n Connectio
位于Java类文件中的lambda的代码在哪里？

问题内容：我有这个Java源文件：我编译它，并且按预期方式工作。这是的输出，我找不到lambda的位置。每当调用lambda时，告诉jvm使用输入来计算表达式的字节码在哪里？问题答案：如果要查看lambda主体的代码，则应调用以查看私有方法：
内置源代码位置

问题内容：我可以在Go的源代码中的哪里找到它们的实现。事实证明，“代码搜索”功能对于这种语言的主要功能几乎是无用的，而且我没有确定要搜索C函数，Go函数还是什么的好方法。将来我又如何在不求助于此的情况下解决这类问题？（即：教我钓鱼）编辑 PS我已经找到了http://golang.org/pkg/builtin/#make，但是，与其余的go包不同的是，它不包含指向源的链接，大概是因为它在
php代码检查代理ip的有效性

本文向大家介绍php代码检查代理ip的有效性，包括了php代码检查代理ip的有效性的使用技巧和注意事项，需要的朋友参考一下本文实例为大家分享了检查代理ip有效性php代码，稳定性，如错误率和查询用时以上就是本文的全部内容，希望对大家的学习有所帮助，也希望大家多多支持呐喊教程。

AVX 256位代码的性能略差于等效的128位SSSE3代码

共有1个答案

相关问答

相关文章

相关阅读

相关工具

相关文档