About CUDA 1D convolution, how can I do this faster?
- int threads = 32;
- dim3 blocks(250000/31, 129, 50);
- coefsize = 129;
__global__ void D_Conv(float *in, float* coef, float *out, int coefsize)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;  // sample index within one channel
  int j = blockIdx.y; // coefficient index, 0..128
  int k = blockIdx.z; // channel index, 0..49
  if (j < coefsize && i < 250000 && k < 50)
  {
    if (i - j >= 0 && i - j < 250000)
    {
      atomicAdd(&out[k*250000 + i], coef[j] * in[k*250000 + i - j]);
    }
  }
}
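For reference, a minimal sketch of how this kernel would be launched with the parameters listed above (this is my assumption about the calling code, not shown in the question; d_in, d_coef and d_out are assumed device pointers). Note that 250000/31 blocks of 32 threads over-cover the 250000 samples, which is why the kernel guards on i < 250000:

  int threads = 32;
  dim3 blocks(250000/31, 129, 50);   // x: samples, y: coefficients, z: channels
  D_Conv<<<blocks, threads>>>(d_in, d_coef, d_out, 129);
  cudaDeviceSynchronize();           // wait for the kernel before reading results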
Many people recommend doing the convolution with an FFT, but in this case the two arrays differ greatly in size (129 vs. 250000), so the FFT-based convolution is slower than this approach.
I don't think you need atomics here. The only thread collisions you would have are in the y dimension, so we can simply reduce your overall grid (in y) and convert the operation into a loop that computes a running sum. Even without the y dimension, your grid has plenty of threads to saturate any GPU.
Here is an example:
$ cat t20.cu
#include <iostream>
#define TOL 0.1
__global__ void D_Conv(float *in, float* coef, float *out, int coefsize)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  int j = blockIdx.y; //129
  int k = blockIdx.z; //50
  if (j < coefsize && i < 250000 && k < 50)
  {
    if (i - j >= 0 && i - j < 250000)
    {
      atomicAdd(&out[k*250000 + i], coef[j] * in[k*250000 + i - j]);
    }
  }
}
__global__ void D_Conv_i(float *in, float* coef, float *out, int coefsize)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  //int j = blockIdx.y; //129
  int k = blockIdx.z; //50
  if (i < 250000 && k < 50)
  {
    float s = 0;
    for (int j = 0; j < 129; j++)
      if (i - j >= 0 && i - j < 250000) s += coef[j] * in[k*250000 + i - j];
    out[k*250000 + i] += s;
  }
}
int main(){
  int num_c = 50;
  int csz = 250000;
  int coefsize = 129;
  int isz = num_c*csz;
  int osz = num_c*csz;
  float *d_in, *h_in, *d_coef, *h_coef, *d_out, *h_out, *h_out_i;
  cudaMalloc(&d_in, isz*sizeof(float));
  cudaMalloc(&d_out, osz*sizeof(float));
  cudaMalloc(&d_coef, coefsize*sizeof(float));
  h_in = new float[isz];
  h_out = new float[osz];
  h_out_i = new float[osz];
  h_coef = new float[coefsize];
  cudaMemset(d_out, 0, osz*sizeof(float));
  for (int i = 0; i < coefsize; i++) h_coef[i] = i%5;
  for (int i = 0; i < isz; i++) h_in[i] = i%4;
  cudaMemcpy(d_in, h_in, isz*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_coef, h_coef, coefsize*sizeof(float), cudaMemcpyHostToDevice);
  int threads = 128;
  dim3 blocks((csz+threads-1)/threads, coefsize, num_c);
  D_Conv<<<blocks, threads>>>(d_in, d_coef, d_out, coefsize);
  cudaMemcpy(h_out, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
  dim3 blocks2((csz+threads-1)/threads, 1, num_c);
  cudaMemset(d_out, 0, osz*sizeof(float));
  D_Conv_i<<<blocks2, threads>>>(d_in, d_coef, d_out, coefsize);
  cudaMemcpy(h_out_i, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < osz; i++) if (fabsf(h_out_i[i] - h_out[i]) > TOL) {std::cout << "mismatch at: " << i << " was: " << h_out_i[i] << " should be: " << h_out[i] << std::endl; return 0;}
}
$ nvcc -o t20 t20.cu
$ cuda-memcheck ./t20
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t20
==14221== NVPROF is profiling process 14221, command: ./t20
==14221== Profiling application: ./t20
==14221== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 53.54% 43.853ms 2 21.926ms 21.863ms 21.989ms [CUDA memcpy DtoH]
26.97% 22.087ms 1 22.087ms 22.087ms 22.087ms D_Conv(float*, float*, float*, int)
17.30% 14.172ms 2 7.0860ms 1.4400us 14.171ms [CUDA memcpy HtoD]
2.04% 1.6702ms 1 1.6702ms 1.6702ms 1.6702ms D_Conv_i(float*, float*, float*, int)
0.14% 118.24us 2 59.122us 56.386us 61.858us [CUDA memset]
API calls: 75.11% 270.97ms 3 90.322ms 189.31us 270.50ms cudaMalloc
23.11% 83.367ms 4 20.842ms 45.694us 44.579ms cudaMemcpy
1.07% 3.8698ms 4 967.45us 449.83us 2.5106ms cuDeviceTotalMem
0.59% 2.1262ms 404 5.2620us 332ns 230.46us cuDeviceGetAttribute
0.06% 223.31us 4 55.828us 47.710us 74.669us cuDeviceGetName
0.03% 98.648us 2 49.324us 31.800us 66.848us cudaMemset
0.02% 86.603us 2 43.301us 13.778us 72.825us cudaLaunchKernel
0.01% 21.169us 4 5.2920us 3.2030us 8.0240us cuDeviceGetPCIBusId
0.00% 11.459us 8 1.4320us 427ns 4.2700us cuDeviceGet
0.00% 3.6360us 3 1.2120us 563ns 1.6820us cuDeviceGetCount
0.00% 2.7220us 4 680ns 520ns 877ns cuDeviceGetUuid
$
(CUDA 11.1U1, Tesla V100)
We can see that the atomic kernel takes over 20ms to run, whereas the non-atomic kernel runs in under 2ms. Also note that I'm running 128 threads per block rather than 32. I'm not sure why you chose 32; I would aim for 64 or more.
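If you don't want to hand-pick the block size, a minimal sketch along these lines (my addition, not part of the measurements above) lets the runtime suggest one via cudaOccupancyMaxPotentialBlockSize; treat it as a starting point and still benchmark:

  int minGridSize = 0, blockSize = 0;
  // Ask the runtime for a block size that maximizes theoretical occupancy of this kernel
  cudaOccupancyMaxPotentialBlockSize(&minGridSize, &blockSize, D_Conv_i, 0, 0);
  dim3 blocks3((csz + blockSize - 1)/blockSize, 1, num_c);
  D_Conv_i<<<blocks3, blockSize>>>(d_in, d_coef, d_out, coefsize);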
Because the coef array is relatively small and the access pattern across the warp is uniform, we can take advantage of __constant__ memory for this data. That provides an additional speedup:
$ cat t20.cu
#include <iostream>
#define TOL 0.1
__global__ void D_Conv(float *in, float* coef, float *out, int coefsize)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  int j = blockIdx.y; //129
  int k = blockIdx.z; //50
  if (j < coefsize && i < 250000 && k < 50)
  {
    if (i - j >= 0 && i - j < 250000)
    {
      atomicAdd(&out[k*250000 + i], coef[j] * in[k*250000 + i - j]);
    }
  }
}
__constant__ float Ccoef[129];
__global__ void D_Conv_i(float *in, float* coef, float *out, int coefsize)
{
  int i = blockIdx.x*blockDim.x + threadIdx.x;
  //int j = blockIdx.y; //129
  int k = blockIdx.z; //50
  if (i < 250000 && k < 50)
  {
    float s = 0;
    for (int j = 0; j < 129; j++)
      if (i - j >= 0 && i - j < 250000) s += Ccoef[j] * in[k*250000 + i - j];
    out[k*250000 + i] += s;
  }
}
int main(){
  int num_c = 50;
  int csz = 250000;
  int coefsize = 129;
  int isz = num_c*csz;
  int osz = num_c*csz;
  float *d_in, *h_in, *d_coef, *h_coef, *d_out, *h_out, *h_out_i;
  cudaMalloc(&d_in, isz*sizeof(float));
  cudaMalloc(&d_out, osz*sizeof(float));
  cudaMalloc(&d_coef, coefsize*sizeof(float));
  h_in = new float[isz];
  h_out = new float[osz];
  h_out_i = new float[osz];
  h_coef = new float[coefsize];
  cudaMemset(d_out, 0, osz*sizeof(float));
  for (int i = 0; i < coefsize; i++) h_coef[i] = i%5;
  for (int i = 0; i < isz; i++) h_in[i] = i%4;
  cudaMemcpy(d_in, h_in, isz*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpy(d_coef, h_coef, coefsize*sizeof(float), cudaMemcpyHostToDevice);
  cudaMemcpyToSymbol(Ccoef, h_coef, coefsize*sizeof(float));
  int threads = 128;
  dim3 blocks((csz+threads-1)/threads, coefsize, num_c);
  D_Conv<<<blocks, threads>>>(d_in, d_coef, d_out, coefsize);
  cudaMemcpy(h_out, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
  dim3 blocks2((csz+threads-1)/threads, 1, num_c);
  cudaMemset(d_out, 0, osz*sizeof(float));
  D_Conv_i<<<blocks2, threads>>>(d_in, d_coef, d_out, coefsize);
  cudaMemcpy(h_out_i, d_out, osz*sizeof(float), cudaMemcpyDeviceToHost);
  for (int i = 0; i < osz; i++) if (fabsf(h_out_i[i] - h_out[i]) > TOL) {std::cout << "mismatch at: " << i << " was: " << h_out_i[i] << " should be: " << h_out[i] << std::endl; return 0;}
}
$ nvcc -o t20 t20.cu
$ cuda-memcheck ./t20
========= CUDA-MEMCHECK
========= ERROR SUMMARY: 0 errors
$ nvprof ./t20
==2191== NVPROF is profiling process 2191, command: ./t20
==2191== Profiling application: ./t20
==2191== Profiling result:
Type Time(%) Time Calls Avg Min Max Name
GPU activities: 54.38% 44.047ms 2 22.024ms 21.997ms 22.051ms [CUDA memcpy DtoH]
27.25% 22.075ms 1 22.075ms 22.075ms 22.075ms D_Conv(float*, float*, float*, int)
17.15% 13.888ms 3 4.6294ms 1.4720us 13.885ms [CUDA memcpy HtoD]
1.07% 869.88us 1 869.88us 869.88us 869.88us D_Conv_i(float*, float*, float*, int)
0.15% 117.83us 2 58.913us 56.321us 61.505us [CUDA memset]
API calls: 77.28% 307.94ms 3 102.65ms 188.61us 307.49ms cudaMalloc
20.70% 82.467ms 4 20.617ms 48.300us 44.617ms cudaMemcpy
1.27% 5.0520ms 4 1.2630ms 593.63us 3.2465ms cuDeviceTotalMem
0.62% 2.4765ms 404 6.1290us 450ns 261.77us cuDeviceGetAttribute
0.07% 271.54us 4 67.884us 59.173us 88.716us cuDeviceGetName
0.02% 97.041us 2 48.520us 30.831us 66.210us cudaMemset
0.02% 86.276us 2 43.138us 14.800us 71.476us cudaLaunchKernel
0.01% 23.142us 1 23.142us 23.142us 23.142us cudaMemcpyToSymbol
0.01% 21.576us 4 5.3940us 3.0900us 8.4600us cuDeviceGetPCIBusId
0.00% 13.604us 8 1.7000us 667ns 4.4800us cuDeviceGet
0.00% 5.7060us 3 1.9020us 452ns 3.5840us cuDeviceGetCount
0.00% 3.2440us 4 811ns 660ns 1.0340us cuDeviceGetUuid
$
The improved kernel now runs in under 1 millisecond, more than 20x faster than the original atomic kernel.
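If you want to confirm these numbers without nvprof, a minimal sketch using CUDA events (my addition, not part of the measurements above; blocks2 and threads are from the code listing) would look like this:

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);
  cudaEventRecord(start);
  D_Conv_i<<<blocks2, threads>>>(d_in, d_coef, d_out, coefsize);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);              // wait until the kernel has finished
  float ms = 0;
  cudaEventElapsedTime(&ms, start, stop);  // elapsed time in milliseconds
  std::cout << "D_Conv_i: " << ms << " ms" << std::endl;
  cudaEventDestroy(start);
  cudaEventDestroy(stop);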