为什么以下代码的 AVX 加速没有达到预期？

Question

我正在使用基本的 _mm256_mul_ps 和 _mm256_add_ps 内在函数编写 AVX 代码，并将其与不使用 AVX 的普通（标量）实现进行比较。由于我使用的是 float 类型，AVX 一次可以加载并处理 8 个元素，那么为什么我得到的加速比只有 3 到 4 倍，而按理说相对于普通实现应该是 8 倍？请查看我的代码并给出建议。谢谢。

// Returns the current time reported by gettimeofday() as fractional seconds:
// the whole-second field plus the microsecond field scaled by 1e-6.
inline double timestamp() {
    struct timeval now;
    gettimeofday(&now, NULL);
    const double wholeSeconds = static_cast<double>(now.tv_sec);
    const double fraction = now.tv_usec / 1000000.;
    return wholeSeconds + fraction;
}

// Element-wise single-precision addition, c[i] = a[i] + b[i], using 256-bit
// AVX vectors, printing every result as it is produced.
//
//   a, b      input arrays; they must be 32-byte aligned because the vector
//             path uses the aligned load _mm256_load_ps
//   c         output array; same alignment requirement (_mm256_store_ps)
//   ARR_SIZE  number of elements; it does not have to be a multiple of 8
//
// NOTE: the printf calls sit inside the loop, so timing this function also
// measures console output, not just the vector arithmetic.
void AVXsum(float *a, float *b, float *c, int ARR_SIZE){

        printf("AVX Addition:\n\n");

        // Only complete groups of 8 floats take the vector path. Running the
        // 8-wide loop all the way to ARR_SIZE would read and write past the
        // end of the arrays whenever ARR_SIZE is not a multiple of 8.
        const int vecEnd = ARR_SIZE - (ARR_SIZE % 8);
        int i = 0;

        for (; i < vecEnd; i += 8){

         __m256 vecA = _mm256_load_ps(&a[i]); // 8 values of a starting at index i
         __m256 vecB = _mm256_load_ps(&b[i]); // 8 values of b starting at index i

         _mm256_store_ps(&c[i], _mm256_add_ps(vecA, vecB)); // c[i..i+7] = a + b

         printf("%f %f %f %f %f %f %f %f\n", c[i + 0], c[i + 1], c[i + 2], c[i + 3], c[i + 4], c[i + 5], c[i + 6], c[i + 7]);

        }

        // Scalar tail: the remaining ARR_SIZE % 8 elements, printed on one line.
        for (; i < ARR_SIZE; i++){

         c[i] = a[i] + b[i];

         printf("%f%c", c[i], (i + 1 < ARR_SIZE) ? ' ' : '\n');

        }

        printf("\n");

}

// Element-wise single-precision multiplication, c[i] = a[i] * b[i], using
// 256-bit AVX vectors, printing every result as it is produced.
//
//   a, b      input arrays; they must be 32-byte aligned because the vector
//             path uses the aligned load _mm256_load_ps
//   c         output array; same alignment requirement (_mm256_store_ps)
//   ARR_SIZE  number of elements; it does not have to be a multiple of 8
//
// NOTE: the printf calls sit inside the loop, so timing this function also
// measures console output, not just the vector arithmetic.
void AVXmul(float *a, float *b, float *c, int ARR_SIZE){

        printf("AVX Multiplication:\n\n");

        // Only complete groups of 8 floats take the vector path. Running the
        // 8-wide loop all the way to ARR_SIZE would read and write past the
        // end of the arrays whenever ARR_SIZE is not a multiple of 8.
        const int vecEnd = ARR_SIZE - (ARR_SIZE % 8);
        int i = 0;

        for (; i < vecEnd; i += 8){

         __m256 vecA = _mm256_load_ps(&a[i]); // 8 values of a starting at index i
         __m256 vecB = _mm256_load_ps(&b[i]); // 8 values of b starting at index i

         _mm256_store_ps(&c[i], _mm256_mul_ps(vecA, vecB)); // c[i..i+7] = a * b

         printf("%f %f %f %f %f %f %f %f\n", c[i + 0], c[i + 1], c[i + 2], c[i + 3], c[i + 4], c[i + 5], c[i + 6], c[i + 7]);

        }

        // Scalar tail: the remaining ARR_SIZE % 8 elements, printed on one line.
        for (; i < ARR_SIZE; i++){

         c[i] = a[i] * b[i];

         printf("%f%c", c[i], (i + 1 < ARR_SIZE) ? ' ' : '\n');

        }

        printf("\n");

}

// Scalar reference implementation of the addition: under a
// "Normal Addition:" heading, stores a[i] + b[i] into c[i] for each of the
// ARR_SIZE elements and prints every stored value followed by a tab.
void Normalsum(float *a, float *b, float *c, int ARR_SIZE){

        printf("Normal Addition:\n\n");

        float *lhs = a;
        float *rhs = b;
        float *out = c;

        // Walk the three arrays in lock-step, one element per iteration.
        for (int remaining = ARR_SIZE; remaining > 0; --remaining){

         *out = *lhs + *rhs;

         printf("%f\t", *out);

         ++lhs;
         ++rhs;
         ++out;

        }

        printf("\n\n");

}

// Scalar reference implementation of the multiplication: under a
// "Normal Multiplication:" heading, stores a[i] * b[i] into c[i] for each of
// the ARR_SIZE elements and prints every stored value followed by a tab.
void Normalmul(float *a, float *b, float *c, int ARR_SIZE){

        printf("Normal Multiplication:\n\n");

        float *lhs = a;
        float *rhs = b;
        float *out = c;

        // Walk the three arrays in lock-step, one element per iteration.
        for (int remaining = ARR_SIZE; remaining > 0; --remaining){

         *out = *lhs * *rhs;

         printf("%f\t", *out);

         ++lhs;
         ++rhs;
         ++out;

        }

        printf("\n");

}

// Benchmark driver: reads an element count from stdin, fills two arrays with
// random values in [1, 100], runs the scalar and the AVX add/multiply routines
// once each, and prints the elapsed time of every run together with the
// scalar/AVX time ratio.
//
// Returns 0 on success, 1 when the entered size is missing, unparsable or
// not positive.
int main(){

    double start, normalsumTime, normalmulTime, avxsumTime, avxmulTime;

    // The entered value becomes the length of the arrays below, so reject
    // anything that did not parse or is not a positive count.
    int size = 0;
    printf("Insert the size of array: ");
    if (scanf("%d", &size) != 1 || size <= 0) {
        fprintf(stderr, "Invalid array size\n");
        return 1;
    }

    // The AVX routines step through the arrays 8 floats at a time, so the
    // storage is rounded up to a multiple of 8 elements; an 8-wide access that
    // starts inside the first `size` elements then always stays in bounds.
    const int padded = size + (8 - size % 8) % 8;

    // 32-byte alignment is required by the aligned AVX load/store intrinsics.
    float a[padded] __attribute__(( aligned(32)));
    float b[padded] __attribute__(( aligned(32)));
    float c[padded] __attribute__(( aligned(32)));

    // Random values for the real elements (a first, then b, so the sequence of
    // rand() calls is the same as filling each array in turn).
    for(int i=0; i<size; i++){
        a[i] = (rand()%100)+1;
    }
    for(int i=0; i<size; i++){
        b[i] = (rand()%100)+1;
    }

    // Give the padding elements a defined value.
    for(int i=size; i<padded; i++){
        a[i] = 0.0f;
        b[i] = 0.0f;
        c[i] = 0.0f;
    }

    // Each routine is called once and timed.
    start = timestamp();
    Normalsum(a, b, c, size);
    normalsumTime = timestamp() - start;

    start = timestamp();
    Normalmul(a, b, c, size);
    normalmulTime = timestamp() - start;

    start = timestamp();
    AVXsum(a, b, c, size);
    avxsumTime = timestamp() - start;

    start = timestamp();
    AVXmul(a, b, c, size);
    avxmulTime = timestamp() - start;

    // Report the timings and the scalar/AVX ratios.
    cout << "Normal Sum took " << normalsumTime << " s" << endl;
    cout << "Normal Mul took " << normalmulTime << " s" << endl;
    cout << "AVX Sum took " << avxsumTime << " s" << endl;
    cout << "AVX Mul took " << avxmulTime << " s" << endl;
    cout << "Sum SpeedUP AVX2= " << normalsumTime / avxsumTime << endl;
    cout << "Mul SpeedUP AVX2= " << normalmulTime / avxmulTime << endl;
    cout << "===========================" << endl;

   return 0;

}

Answer 1

看起来主要有以下几个问题：

你没有多次重复运行被测函数并取最佳时间，而这对于得到可靠的计时结果是必不可少的！
被计时的函数中不应包含 printf 之类的调用，否则测到的主要是输出的开销。
每次迭代除了一条 AVX 算术指令之外，还要执行两次 _mm256_load_ps 和一次 _mm256_store_ps，也就是说你的程序并不只是在执行 AVX 计算指令。换句话说，内存访问指令拖累了性能，因此无法得到 8 倍的加速。

我更改了您的实现以获得更准确的结果

#include <float.h>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include <x86intrin.h>


// Element-wise single-precision addition, c[i] = a[i] + b[i], using 256-bit
// AVX vectors.
//
//   a, b      input arrays; they must be 32-byte aligned because the vector
//             path uses the aligned load _mm256_load_ps
//   c         output array; same alignment requirement (_mm256_store_ps)
//   ARR_SIZE  number of elements; it does not have to be a multiple of 8
//
// Declared `static inline`: in C a plain `inline` definition provides no
// external definition, so the program fails to link whenever the compiler
// does not inline the call (for example when built without optimisation).
static inline void AVXsum(const float *a, const float *b, float *c, int ARR_SIZE)
{
    // Only complete groups of 8 floats take the vector path; running the
    // 8-wide loop up to ARR_SIZE would access memory past the end of the
    // arrays whenever ARR_SIZE is not a multiple of 8.
    const int vec_end = ARR_SIZE - (ARR_SIZE % 8);
    int i = 0;

    for (; i < vec_end; i += 8) {
        __m256 res = _mm256_add_ps(_mm256_load_ps(&a[i]), _mm256_load_ps(&b[i])); // add 8 values of a and b
        _mm256_store_ps(&c[i], res); // store the 8 sums starting at c[i]
    }

    // Scalar tail for the remaining ARR_SIZE % 8 elements.
    for (; i < ARR_SIZE; i++) {
        c[i] = a[i] + b[i];
    }
}

// Element-wise single-precision multiplication, c[i] = a[i] * b[i], using
// 256-bit AVX vectors.
//
//   a, b      input arrays; they must be 32-byte aligned because the vector
//             path uses the aligned load _mm256_load_ps
//   c         output array; same alignment requirement (_mm256_store_ps)
//   ARR_SIZE  number of elements; it does not have to be a multiple of 8
//
// Declared `static inline`: in C a plain `inline` definition provides no
// external definition, so the program fails to link whenever the compiler
// does not inline the call (for example when built without optimisation).
static inline void AVXmul(const float *a, const float *b, float *c, int ARR_SIZE)
{
    // Only complete groups of 8 floats take the vector path; running the
    // 8-wide loop up to ARR_SIZE would access memory past the end of the
    // arrays whenever ARR_SIZE is not a multiple of 8.
    const int vec_end = ARR_SIZE - (ARR_SIZE % 8);
    int i = 0;

    for (; i < vec_end; i += 8) {
        __m256 res = _mm256_mul_ps(_mm256_load_ps(&a[i]), _mm256_load_ps(&b[i])); // multiply 8 values of a and b
        _mm256_store_ps(&c[i], res); // store the 8 products starting at c[i]
    }

    // Scalar tail for the remaining ARR_SIZE % 8 elements.
    for (; i < ARR_SIZE; i++) {
        c[i] = a[i] * b[i];
    }
}

// Scalar reference implementation: c[i] = a[i] + b[i] for i in [0, ARR_SIZE).
//
// Declared `static inline`: in C a plain `inline` definition provides no
// external definition, so the program fails to link whenever the compiler
// does not inline the call (for example when built without optimisation).
static inline void Normalsum(const float *a, const float *b, float *c, int ARR_SIZE)
{
    for (int i = 0; i < ARR_SIZE; i++) {
        c[i] = a[i] + b[i];
    }
}

// Scalar reference implementation: c[i] = a[i] * b[i] for i in [0, ARR_SIZE).
//
// Declared `static inline`: in C a plain `inline` definition provides no
// external definition, so the program fails to link whenever the compiler
// does not inline the call (for example when built without optimisation).
static inline void Normalmul(const float *a, const float *b, float *c, int ARR_SIZE)
{
    for (int i = 0; i < ARR_SIZE; i++) {
        c[i] = a[i] * b[i];
    }
}
// Benchmark configuration used by main().
// Number of float elements in each array; 10000 is a multiple of 8, matching
// the 8-float step of the AVX loops.
#define size 10000
// Element count passed to the add/multiply routines (alias of size).
#define arrsize size
// Controls how many times each timing loop repeats the routine under test.
#define NUM_LOOP 1000000

// Seconds elapsed between two clock_gettime() samples.
static double elapsed_seconds(const struct timespec *start, const struct timespec *end)
{
    double total = (double)(end->tv_sec - start->tv_sec);
    total += (double)(end->tv_nsec - start->tv_nsec) / 1000000000.0;
    return total;
}

// Benchmark driver: fills two arrays with random values in [1, 100], runs each
// of the four add/multiply routines NUM_LOOP times, keeps the shortest time
// measured for each, and prints those best times and the scalar/AVX ratios.
int main(){

    double  normalsumTime, normalmulTime, avxsumTime, avxmulTime;
    struct timespec tStart, tEnd;
    double tTotal, tBest;

    printf("the size of array is: %d \n", size);

    // Input arrays; 32-byte alignment is required by the aligned AVX
    // load/store intrinsics used in AVXsum and AVXmul.
    float a[size] __attribute__(( aligned(32)));
    for(int i=0; i<size; i++){
        a[i] = (float)((rand()%100)+1);
    }

    float b[size] __attribute__(( aligned(32)));
    for(int i=0; i<size; i++){
        b[i] = (float)((rand()%100)+1);
    }

    float c[arrsize] __attribute__(( aligned(32)));

    // Every section below repeats one routine exactly NUM_LOOP times and
    // records the best (smallest) time of a single call. tBest starts at
    // DBL_MAX so that the first measurement always replaces it.
    printf("\nNormal Addition ... :\n\n");
    tBest = DBL_MAX;
    for (int w = 0; w < NUM_LOOP; w++) {
        clock_gettime(CLOCK_MONOTONIC, &tStart);
        Normalsum(a, b, c, arrsize);
        clock_gettime(CLOCK_MONOTONIC, &tEnd);

        tTotal = elapsed_seconds(&tStart, &tEnd);
        if (tTotal < tBest)
            tBest = tTotal;
    }
    normalsumTime = tBest;

    printf("Normal Multiplication .... \n\n");
    tBest = DBL_MAX;
    for (int w = 0; w < NUM_LOOP; w++) {
        clock_gettime(CLOCK_MONOTONIC, &tStart);
        Normalmul(a, b, c, arrsize);
        clock_gettime(CLOCK_MONOTONIC, &tEnd);

        tTotal = elapsed_seconds(&tStart, &tEnd);
        if (tTotal < tBest)
            tBest = tTotal;
    }
    normalmulTime = tBest;

    printf("AVX Addition....\n\n");
    tBest = DBL_MAX;
    for (int w = 0; w < NUM_LOOP; w++) {
        clock_gettime(CLOCK_MONOTONIC, &tStart);
        AVXsum(a, b, c, arrsize);
        clock_gettime(CLOCK_MONOTONIC, &tEnd);

        tTotal = elapsed_seconds(&tStart, &tEnd);
        if (tTotal < tBest)
            tBest = tTotal;
    }
    avxsumTime = tBest;

    printf("AVX Multiplication ....\n\n");
    tBest = DBL_MAX;
    for (int w = 0; w < NUM_LOOP; w++) {
        clock_gettime(CLOCK_MONOTONIC, &tStart);
        AVXmul(a, b, c, arrsize);
        clock_gettime(CLOCK_MONOTONIC, &tEnd);

        tTotal = elapsed_seconds(&tStart, &tEnd);
        if (tTotal < tBest)
            tBest = tTotal;
    }
    avxmulTime = tBest;

    // Report the best times and the scalar/AVX ratios.
    printf("Normal Sum took %lf s\n" , normalsumTime);
    printf("Normal Mul took %lf s\n",  normalmulTime);
    printf("AVX Sum took %lf s \n", avxsumTime);
    printf( "AVX Mul took %lf s\n", avxmulTime);
    printf("Sum SpeedUP AVX= %lf ", normalsumTime / avxsumTime );
    printf("Mul SpeedUP AVX= %lf \n", normalmulTime / avxmulTime );
    printf( "===========================\n");

    return 0;

}

输出为：

//gcc -O2 
//skylake
Normal Sum took 0.000005 s
Normal Mul took 0.000005 s
AVX Sum took 0.000001 s 
AVX Mul took 0.000001 s
Sum SpeedUP AVX= 4.418283 Mul SpeedUP AVX= 4.491080

为什么以下代码的 AVX 加速没有达到预期？

Why AVX speedup for the following code is not as expected?

c++

parallel-processing

sse

simd

avx