向量与 SIMD 的点积

Question

我正在尝试使用 SIMD 指令来加速我的 C 代码中的点积计算。但是，我的函数的运行次大致相等。如果有人能解释为什么以及如何加速计算，那就太好了。

具体来说，我正在尝试计算两个包含大约 10,000 个元素的数组的点积。我的常规 C 函数如下：

 float my_dotProd( float const * const x, float const * const y, size_t const N ){
   // N is the number of elements in the arrays
   size_t i;
   float out=0;

   for( i=0; i < N; ++i ){
     out += x[i] * y[i];
   }

   return out;
 }

我使用 AVX SIMD 命令的函数如下：

 void my_malloc( size_t nBytes, void ** ptrPtr ){
   int boundary = 32;
   posix_memalign( ptrPtr, boundary, nBytes );
 }

 float cimpl_sum_m128( __m128 x ){
   float out;
   __m128 sum = x;
   sum = _mm_hadd_ps( sum, sum );
   sum = _mm_hadd_ps( sum, sum );
   out = _mm_cvtss_f32( sum );
   return out;
 }

 float my_sum_m256( __m256 x ){
   float out1, out2;
   __m128 hi = _mm256_extractf128_ps(x, 1);
   __m128 lo = _mm256_extractf128_ps(x, 0);
   out1 = cimpl_sum_m128( hi );
   out2 = cimpl_sum_m128( lo );
   return out1 + out2;
 }

 float my_dotProd( float const * const x, float const * const y, size_t const N ){
   // N is the number of elements in the arrays
   size_t i=0;
   float out=0;
   float *tmp;

   __m256 summed, *l, *r;

   if( N > 7 ){
     my_malloc( sizeof(float) * 8, (void**) &tmp );
     summed = _mm256_set1_ps(0.0f);
     l = (__m256*) x;
     r = (__m256*) y;

     for( i=0; i < N-7; i+=8, ++l, ++r ){
       summed = _mm256_add_ps( summed, _mm256_mul_ps( *l, *r ) );
     }
     _mm256_store_ps( tmp, summed );

     out += my_sum_m256( summed );
     free( tmp );
   }

   for( ; i < N; ++i ){
     out += x[i] * y[i];
   }

   return out;
 }

我的测试程序是：

 int test_dotProd(){
   float *x, *y;
   size_t i, N;
   float answer, result;
   float err;

   N = 100000;  // Fails

   my_malloc( sizeof(float) * N, (void**) &x );
   my_malloc( sizeof(float) * N, (void**) &y );

   answer = 0;
   for( i=0; i<N; ++i ){
     x[i]=i; y[i]=i;
     answer += (float)i * (float)i;
   }

   result = my_dotProd( x, y, N );

   err = fabs( result - answer ) / answer;

   free( x );
   free( y );
   return err < 5e-7;
 }

我正在使用时钟来测量运行时间，如下所示：

 timeStart = clock();
 testStatus = test_dotProd();
 timeTaken = (int)( clock() - timeStart );

我意识到 my_sum_m256 操作可以提高效率，但我认为这对运行时间的影响应该很小。我猜想 SIMD 代码的速度大约是原来的八倍。有什么想法吗？

谢谢大家的帮助:)

Answer 1

不幸的是，点积算法是内存限制算法（计算次数小于所需内存吞吐量）。因此，即使使用 AVX（或 AVX2）也无法有效地执行它。我也在similar way中实现了这个算法，但是我只达到了60%的性能提升。

Answer 2

首先：你不应该假设你可以比编译器优化得更好。

是的，您现在正在 "optimized" 代码中使用 AVX 指令。但是除了普通矢量化之外，您还编写了编译器现在展开时有问题的代码。

为了比较，让我们看看编译器实际上会从您的 "slow" C 实现中生成什么，只是没有页脚的热循环。

ICC, compiled with -O3 -march=skylake -ffast-math:

..B1.13:
    vmovups   ymm2, YMMWORD PTR [rsi+rdi*4]
    vmovups   ymm3, YMMWORD PTR [32+rsi+rdi*4]
    vfmadd231ps ymm1, ymm2, YMMWORD PTR [r8+rdi*4]
    vfmadd231ps ymm0, ymm3, YMMWORD PTR [32+r8+rdi*4]
    add       rdi, 16
    cmp       rdi, rax
    jb        ..B1.13

Clang, with the same parameters 更加悲观并将其展开为以下内容：

.LBB0_4:
    vmovups ymm4, ymmword ptr [rsi + 4*rcx]
    vmovups ymm5, ymmword ptr [rsi + 4*rcx + 32]
    vmovups ymm6, ymmword ptr [rsi + 4*rcx + 64]
    vmovups ymm7, ymmword ptr [rsi + 4*rcx + 96]
    vfmadd132ps     ymm4, ymm0, ymmword ptr [rdi + 4*rcx]
    vfmadd132ps     ymm5, ymm1, ymmword ptr [rdi + 4*rcx + 32]
    vfmadd132ps     ymm6, ymm2, ymmword ptr [rdi + 4*rcx + 64]
    vfmadd132ps     ymm7, ymm3, ymmword ptr [rdi + 4*rcx + 96]
    vmovups ymm0, ymmword ptr [rsi + 4*rcx + 128]
    vmovups ymm1, ymmword ptr [rsi + 4*rcx + 160]
    vmovups ymm2, ymmword ptr [rsi + 4*rcx + 192]
    vmovups ymm3, ymmword ptr [rsi + 4*rcx + 224]
    vfmadd132ps     ymm0, ymm4, ymmword ptr [rdi + 4*rcx + 128]
    vfmadd132ps     ymm1, ymm5, ymmword ptr [rdi + 4*rcx + 160]
    vfmadd132ps     ymm2, ymm6, ymmword ptr [rdi + 4*rcx + 192]
    vfmadd132ps     ymm3, ymm7, ymmword ptr [rdi + 4*rcx + 224]
    add     rcx, 64
    add     rax, 2
    jne     .LBB0_4

令人惊讶的是，两个编译器都已经能够使用 AVX 指令，不需要内在的黑客攻击。

但更有趣的是，两个编译器都认为一个累加寄存器不足以使 AVX 流水线饱和，而是分别使用 2 个和 4 个累加寄存器。在运行中进行更多操作有助于掩盖 FMA 的延迟，直至达到实际内存吞吐量限制。

只是不要忘记 -ffast-math 编译器选项，否则将最终累加从矢量化循环中拉出是不合法的。

GCC, also with the same options，实际上 "only" 与您的 "optimized" 解决方案一样好：

.L7:
    add     r8, 1
    vmovaps ymm3, YMMWORD PTR [r9+rax]
    vfmadd231ps     ymm1, ymm3, YMMWORD PTR [rcx+rax]
    add     rax, 32
    cmp     r8, r10
    jb      .L7

然而，GCC 在向该循环添加 header 方面仍然更聪明一些，因此它可以使用 vmovaps（对齐的内存访问）而不是 vmovups（未对齐的内存access) 第一次加载。

为完整起见，使用纯 AVX (-O3 -march=ivybridge -ffast-math)：

ICC:

..B1.12:
    vmovups   xmm2, XMMWORD PTR [r8+rdi*4]
    vmovups   xmm5, XMMWORD PTR [32+r8+rdi*4]
    vinsertf128 ymm3, ymm2, XMMWORD PTR [16+r8+rdi*4], 1
    vinsertf128 ymm6, ymm5, XMMWORD PTR [48+r8+rdi*4], 1
    vmulps    ymm4, ymm3, YMMWORD PTR [rsi+rdi*4]
    vmulps    ymm7, ymm6, YMMWORD PTR [32+rsi+rdi*4]
    vaddps    ymm1, ymm1, ymm4
    vaddps    ymm0, ymm0, ymm7
    add       rdi, 16
    cmp       rdi, rax
    jb        ..B1.12

叮当声：

.LBB0_5:
    vmovups xmm4, xmmword ptr [rdi + 4*rcx]
    vmovups xmm5, xmmword ptr [rdi + 4*rcx + 32]
    vmovups xmm6, xmmword ptr [rdi + 4*rcx + 64]
    vmovups xmm7, xmmword ptr [rdi + 4*rcx + 96]
    vinsertf128     ymm4, ymm4, xmmword ptr [rdi + 4*rcx + 16], 1
    vinsertf128     ymm5, ymm5, xmmword ptr [rdi + 4*rcx + 48], 1
    vinsertf128     ymm6, ymm6, xmmword ptr [rdi + 4*rcx + 80], 1
    vinsertf128     ymm7, ymm7, xmmword ptr [rdi + 4*rcx + 112], 1
    vmovups xmm8, xmmword ptr [rsi + 4*rcx]
    vmovups xmm9, xmmword ptr [rsi + 4*rcx + 32]
    vmovups xmm10, xmmword ptr [rsi + 4*rcx + 64]
    vmovups xmm11, xmmword ptr [rsi + 4*rcx + 96]
    vinsertf128     ymm8, ymm8, xmmword ptr [rsi + 4*rcx + 16], 1
    vmulps  ymm4, ymm8, ymm4
    vaddps  ymm0, ymm4, ymm0
    vinsertf128     ymm4, ymm9, xmmword ptr [rsi + 4*rcx + 48], 1
    vmulps  ymm4, ymm4, ymm5
    vaddps  ymm1, ymm4, ymm1
    vinsertf128     ymm4, ymm10, xmmword ptr [rsi + 4*rcx + 80], 1
    vmulps  ymm4, ymm4, ymm6
    vaddps  ymm2, ymm4, ymm2
    vinsertf128     ymm4, ymm11, xmmword ptr [rsi + 4*rcx + 112], 1
    vmulps  ymm4, ymm4, ymm7
    vaddps  ymm3, ymm4, ymm3
    add     rcx, 32
    cmp     rax, rcx
    jne     .LBB0_5

海湾合作委员会：

.L5:
    vmovups xmm3, XMMWORD PTR [rdi+rax]
    vinsertf128     ymm1, ymm3, XMMWORD PTR [rdi+16+rax], 0x1
    vmovups xmm4, XMMWORD PTR [rsi+rax]
    vinsertf128     ymm2, ymm4, XMMWORD PTR [rsi+16+rax], 0x1
    add     rax, 32
    vmulps  ymm1, ymm1, ymm2
    vaddps  ymm0, ymm0, ymm1
    cmp     rax, rcx
    jne     .L5

应用了几乎相同的优化，只是缺少一些额外的操作，因为缺少 FMA 并且不建议对 Ivy Bridge 进行未对齐的 256 位加载。

向量与 SIMD 的点积

Dot Product of Vectors with SIMD

c

x86

simd

avx