Regarding CUDA 1D convolution, how can I make this faster?

__global__ void D_Conv(float *in, float* coef, float *out, int coefsize)
{

    int i = blockIdx.x*blockDim.x + threadIdx.x; 
    int j = blockIdx.y; //129
    int k = blockIdx.z; //50

    if (j < coefsize && i < 250000 && k < 50)
    {
        if (i - j >= 0 && i - j < 250000)
        {
            atomicAdd(&out[k*250000 + i], coef[j] * in[k*250000 + i - j]); 
        }
    }
}
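
For reference, here is a minimal host-side sketch (my own addition, not part of the original question) of the operation that kernel computes: for each of the 50 channels of 250000 samples, a causal 1D convolution with a 129-tap filter.

void conv1d_ref(const float *in, const float *coef, float *out,
                int num_c, int csz, int coefsize)
{
    for (int k = 0; k < num_c; k++)           // channel
        for (int i = 0; i < csz; i++)         // output sample
        {
            float s = 0.0f;
            for (int j = 0; j < coefsize; j++)
                if (i - j >= 0)               // only taps that stay inside the signal
                    s += coef[j] * in[k*csz + i - j];
            out[k*csz + i] = s;
        }
}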

Many people recommend using an FFT for convolution, but in this case the two arrays differ greatly in size (129 vs. 250000), so FFT-based convolution is slower than this approach.

I don't see any need for atomics here. The only thread conflicts you would have are in the y dimension, so we can simply collapse your grid (in y) and convert the operation into a loop that computes a running sum within each thread. Even without the y dimension, your grid has plenty of threads to saturate any GPU.

Here is an example:

$ cat t20.cu
#include <iostream>
#define TOL 0.1
__global__ void D_Conv(float *in, float* coef, float *out, int coefsize)
{

    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y; //129
    int k = blockIdx.z; //50

    if (j < coefsize && i < 250000 && k < 50)
    {
        if (i - j >= 0 && i - j < 250000)
        {
            atomicAdd(&out[k*250000 + i], coef[j] * in[k*250000 + i - j]);
        }
    }
}

__global__ void D_Conv_i(float *in, float* coef, float *out, int coefsize)
{

    int i = blockIdx.x*blockDim.x + threadIdx.x;
    //int j = blockIdx.y; //129
    int k = blockIdx.z; //50

    if (i < 250000 && k < 50)
    {
        float s = 0;
        for (int j = 0; j < 129; j++)
          if  (i - j >= 0 && i - j < 250000) s += coef[j] * in[k*250000 + i - j];
        out[k*250000 + i] += s;
    }
}

int main(){
  int num_c = 50;
  int csz = 250000;
  int coefsize = 129;
  int isz = num_c*csz;
  int osz = num_c*csz;
  float *d_in, *h_in, *d_coef, *h_coef, *d_out, *h_out, *h_out_i;
  cudaMalloc(&d_in,   isz*sizeof(float));
  cudaMalloc(&d_out,  osz*sizeof(float));
  cudaMalloc(&d_coef, coefsize*sizeof(float));
  h_in    = new float[isz];
  h_out   = new float[osz];
  h_out_i = new float[osz];
  h_coef  = new float[coefsize];
  cudaMemset(d_out, 0, osz*sizeof(float));
  for (int i = 0; i < coefsize; i++) h_coef[i] = i%5;
  for (int i = 0; i < isz; i++) h_in[i] = i%4;
  cudaMemcpy(d_in, h_in, isz*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_coef, h_coef, coefsize*sizeof(float), cudaMemcpyHostToDevice);
  int threads = 128;
  dim3 blocks((csz+threads-1)/threads, coefsize, num_c);
  D_Conv<<<blocks, threads>>>(d_in, d_coef, d_out, coefsize);
  cudaMemcpy(h_out, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
  dim3 blocks2((csz+threads-1)/threads, 1, num_c);
  cudaMemset(d_out, 0, osz*sizeof(float));
  D_Conv_i<<<blocks2, threads>>>(d_in, d_coef, d_out, coefsize);
  cudaMemcpy(h_out_i, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < osz; i++) if (fabsf(h_out_i[i] - h_out[i]) > TOL) {std::cout << "mismatch at: " << i << " was: " << h_out_i[i] << " should be: " << h_out[i] << std::endl; return 0;}
}
$ nvcc -o t20 t20.cu
$ cuda-memcheck ./t20
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t20
==14221== NVPROF is profiling process 14221, command: ./t20
==14221== Profiling application: ./t20
==14221== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   53.54%  43.853ms         2  21.926ms  21.863ms  21.989ms  [CUDA memcpy DtoH]
                   26.97%  22.087ms         1  22.087ms  22.087ms  22.087ms  D_Conv(float*, float*, float*, int)
                   17.30%  14.172ms         2  7.0860ms  1.4400us  14.171ms  [CUDA memcpy HtoD]
                    2.04%  1.6702ms         1  1.6702ms  1.6702ms  1.6702ms  D_Conv_i(float*, float*, float*, int)
                    0.14%  118.24us         2  59.122us  56.386us  61.858us  [CUDA memset]
      API calls:   75.11%  270.97ms         3  90.322ms  189.31us  270.50ms  cudaMalloc
                   23.11%  83.367ms         4  20.842ms  45.694us  44.579ms  cudaMemcpy
                    1.07%  3.8698ms         4  967.45us  449.83us  2.5106ms  cuDeviceTotalMem
                    0.59%  2.1262ms       404  5.2620us     332ns  230.46us  cuDeviceGetAttribute
                    0.06%  223.31us         4  55.828us  47.710us  74.669us  cuDeviceGetName
                    0.03%  98.648us         2  49.324us  31.800us  66.848us  cudaMemset
                    0.02%  86.603us         2  43.301us  13.778us  72.825us  cudaLaunchKernel
                    0.01%  21.169us         4  5.2920us  3.2030us  8.0240us  cuDeviceGetPCIBusId
                    0.00%  11.459us         8  1.4320us     427ns  4.2700us  cuDeviceGet
                    0.00%  3.6360us         3  1.2120us     563ns  1.6820us  cuDeviceGetCount
                    0.00%  2.7220us         4     680ns     520ns     877ns  cuDeviceGetUuid
$

(CUDA 11.1U1, Tesla V100)

We can see that the atomic kernel takes over 20 ms to run, while the non-atomic kernel runs in less than 2 ms. Also note that I'm running 128 threads per block rather than 32. I'm not sure why you chose 32; I would aim for 64 or higher.
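
If you don't want to hand-pick the block size, here is a minimal sketch (my addition, not in the original answer) that uses the occupancy API to let the runtime suggest one for the non-atomic kernel:

int blockSize = 0, minGridSize = 0;
// ask the runtime for a block size that maximizes occupancy for D_Conv_i
cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, D_Conv_i, 0, 0);
dim3 blocks2((csz + blockSize - 1)/blockSize, 1, num_c);
D_Conv_i<<<blocks2, blockSize>>>(d_in, d_coef, d_out, coefsize);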

Because the coef array is relatively small, and the access pattern to it is uniform across the warp, we can take advantage of __constant__ memory for this data. That gives an additional speedup:

$ cat t20.cu
#include <iostream>
#define TOL 0.1
__global__ void D_Conv(float *in, float* coef, float *out, int coefsize)
{

    int i = blockIdx.x*blockDim.x + threadIdx.x;
    int j = blockIdx.y; //129
    int k = blockIdx.z; //50

    if (j < coefsize && i < 250000 && k < 50)
    {
        if (i - j >= 0 && i - j < 250000)
        {
            atomicAdd(&out[k*250000 + i], coef[j] * in[k*250000 + i - j]);
        }
    }
}
__constant__ float Ccoef[129];
__global__ void D_Conv_i(float *in, float* coef, float *out, int coefsize)
{

    int i = blockIdx.x*blockDim.x + threadIdx.x;
    //int j = blockIdx.y; //129
    int k = blockIdx.z; //50

    if (i < 250000 && k < 50)
    {
        float s = 0;
        for (int j = 0; j < 129; j++)
          if  (i - j >= 0 && i - j < 250000) s += Ccoef[j] * in[k*250000 + i - j];
        out[k*250000 + i] += s;
    }
}

int main(){
  int num_c = 50;
  int csz = 250000;
  int coefsize = 129;
  int isz = num_c*csz;
  int osz = num_c*csz;
  float *d_in, *h_in, *d_coef, *h_coef, *d_out, *h_out, *h_out_i;
  cudaMalloc(&d_in,   isz*sizeof(float));
  cudaMalloc(&d_out,  osz*sizeof(float));
  cudaMalloc(&d_coef, coefsize*sizeof(float));
  h_in    = new float[isz];
  h_out   = new float[osz];
  h_out_i = new float[osz];
  h_coef  = new float[coefsize];
  cudaMemset(d_out, 0, osz*sizeof(float));
  for (int i = 0; i < coefsize; i++) h_coef[i] = i%5;
  for (int i = 0; i < isz; i++) h_in[i] = i%4;
  cudaMemcpy(d_in, h_in, isz*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_coef, h_coef, coefsize*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(Ccoef, h_coef, coefsize*sizeof(float));
  int threads = 128;
  dim3 blocks((csz+threads-1)/threads, coefsize, num_c);
  D_Conv<<<blocks, threads>>>(d_in, d_coef, d_out, coefsize);
  cudaMemcpy(h_out, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
  dim3 blocks2((csz+threads-1)/threads, 1, num_c);
  cudaMemset(d_out, 0, osz*sizeof(float));
  D_Conv_i<<<blocks2, threads>>>(d_in, d_coef, d_out, coefsize);
  cudaMemcpy(h_out_i, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < osz; i++) if (fabsf(h_out_i[i] - h_out[i]) > TOL) {std::cout << "mismatch at: " << i << " was: " << h_out_i[i] << " should be: " << h_out[i] << std::endl; return 0;}
}
$ nvcc -o t20 t20.cu
$ cuda-memcheck ./t20
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t20
==2191== NVPROF is profiling process 2191, command: ./t20
==2191== Profiling application: ./t20
==2191== Profiling result:
            Type  Time(%)      Time     Calls       Avg       Min       Max  Name
 GPU activities:   54.38%  44.047ms         2  22.024ms  21.997ms  22.051ms  [CUDA memcpy DtoH]
                   27.25%  22.075ms         1  22.075ms  22.075ms  22.075ms  D_Conv(float*, float*, float*, int)
                   17.15%  13.888ms         3  4.6294ms  1.4720us  13.885ms  [CUDA memcpy HtoD]
                    1.07%  869.88us         1  869.88us  869.88us  869.88us  D_Conv_i(float*, float*, float*, int)
                    0.15%  117.83us         2  58.913us  56.321us  61.505us  [CUDA memset]
      API calls:   77.28%  307.94ms         3  102.65ms  188.61us  307.49ms  cudaMalloc
                   20.70%  82.467ms         4  20.617ms  48.300us  44.617ms  cudaMemcpy
                    1.27%  5.0520ms         4  1.2630ms  593.63us  3.2465ms  cuDeviceTotalMem
                    0.62%  2.4765ms       404  6.1290us     450ns  261.77us  cuDeviceGetAttribute
                    0.07%  271.54us         4  67.884us  59.173us  88.716us  cuDeviceGetName
                    0.02%  97.041us         2  48.520us  30.831us  66.210us  cudaMemset
                    0.02%  86.276us         2  43.138us  14.800us  71.476us  cudaLaunchKernel
                    0.01%  23.142us         1  23.142us  23.142us  23.142us  cudaMemcpyToSymbol
                    0.01%  21.576us         4  5.3940us  3.0900us  8.4600us  cuDeviceGetPCIBusId
                    0.00%  13.604us         8  1.7000us     667ns  4.4800us  cuDeviceGet
                    0.00%  5.7060us         3  1.9020us     452ns  3.5840us  cuDeviceGetCount
                    0.00%  3.2440us         4     811ns     660ns  1.0340us  cuDeviceGetUuid
$

The improved kernel now runs in under 1 ms: roughly a 2x improvement over the previous non-atomic kernel, and more than 20x faster than the original atomic kernel.
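
Note that both the Ccoef capacity and the inner loop bound are hard-coded to 129 above. If the filter length can vary, here is a minimal sketch (my addition, not benchmarked) of a parameterized variant that keeps the constant-memory coefficients but takes the sizes as arguments, assuming coefsize never exceeds the 129-entry capacity of Ccoef:

__global__ void D_Conv_c(const float * __restrict__ in, float * __restrict__ out,
                         int coefsize, int csz, int num_c)
{
    int i = blockIdx.x*blockDim.x + threadIdx.x;  // sample index within a channel
    int k = blockIdx.z;                           // channel index

    if (i < csz && k < num_c)
    {
        float s = 0.0f;
        for (int j = 0; j < coefsize && j <= i; j++)  // causal taps only
            s += Ccoef[j] * in[k*csz + i - j];
        out[k*csz + i] = s;                           // write (rather than accumulate) the result
    }
}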