CUDA Optimization Not Working
I found this PDF (http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf) that walks you through several ways of optimizing a reduction operation in CUDA, and I'm trying to follow along. For reduction #5 it suggests unrolling the last 6 iterations of the loop with the following code:
if (tid < 32)
{
    sdata[tid] += sdata[tid + 32];
    sdata[tid] += sdata[tid + 16];
    sdata[tid] += sdata[tid + 8];
    sdata[tid] += sdata[tid + 4];
    sdata[tid] += sdata[tid + 2];
    sdata[tid] += sdata[tid + 1];
}
The previous slide even says:
- As reduction proceeds, # “active” threads decreases
  - When s <= 32, we have only one warp left
- Instructions are SIMD synchronous within a warp
- That means when s <= 32:
  - We don’t need to __syncthreads()
  - We don’t need “if (tid < s)” because it doesn’t save any work
However, when I try this approach, the sum I get from the reduction is much smaller than with the previous approach. If I add __syncthreads() after each write to shared memory, I get the correct result.

Are the claims that "Instructions are SIMD synchronous within a warp" and that "We don't need to __syncthreads()" actually true? Or is this an old document and the technique has changed?
You need to use the volatile keyword, as njuffa commented below. Without volatile, the compiler is free to keep the intermediate values of sdata in registers, so the other threads in the warp never observe the partial sums.
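As a minimal sketch, this is the fix applied to the snippet from the question (the alias name vsmem is my own choice, not from the document):

// Reading and writing the shared array through a volatile pointer
// forces every access to go to shared memory rather than being cached
// in registers, so lanes in the warp actually see each other's updates.
volatile int *vsmem = sdata;
if (tid < 32)
{
    vsmem[tid] += vsmem[tid + 32];
    vsmem[tid] += vsmem[tid + 16];
    vsmem[tid] += vsmem[tid + 8];
    vsmem[tid] += vsmem[tid + 4];
    vsmem[tid] += vsmem[tid + 2];
    vsmem[tid] += vsmem[tid + 1];
}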
There is an updated version of the same document here:
https://developer.download.nvidia.com/assets/cuda/files/reduction.pdf

Here is the equivalent example #6 for reference:
template <unsigned int blockSize>
__device__ void warpReduce(volatile int *sdata, unsigned int tid) {
    // Final warp: no __syncthreads() needed, but sdata must be volatile
    // so each partial sum is written to (and re-read from) shared memory.
    if (blockSize >= 64) sdata[tid] += sdata[tid + 32];
    if (blockSize >= 32) sdata[tid] += sdata[tid + 16];
    if (blockSize >= 16) sdata[tid] += sdata[tid + 8];
    if (blockSize >= 8)  sdata[tid] += sdata[tid + 4];
    if (blockSize >= 4)  sdata[tid] += sdata[tid + 2];
    if (blockSize >= 2)  sdata[tid] += sdata[tid + 1];
}
template <unsigned int blockSize>
__global__ void reduce6(int *g_idata, int *g_odata, unsigned int n) {
    extern __shared__ int sdata[];
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x*(blockSize*2) + tid;
    unsigned int gridSize = blockSize*2*gridDim.x;
    sdata[tid] = 0;
    // Grid-stride loop: each thread sums two elements per iteration.
    while (i < n) { sdata[tid] += g_idata[i] + g_idata[i+blockSize]; i += gridSize; }
    __syncthreads();
    // Tree reduction in shared memory; the blockSize checks are resolved
    // at compile time, so unused steps generate no code.
    if (blockSize >= 512) { if (tid < 256) { sdata[tid] += sdata[tid + 256]; } __syncthreads(); }
    if (blockSize >= 256) { if (tid < 128) { sdata[tid] += sdata[tid + 128]; } __syncthreads(); }
    if (blockSize >= 128) { if (tid < 64)  { sdata[tid] += sdata[tid + 64];  } __syncthreads(); }
    // blockSize cannot be deduced from the arguments, so it must be
    // passed explicitly for this call to compile.
    if (tid < 32) warpReduce<blockSize>(sdata, tid);
    if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
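For completeness, here is a sketch of how the templated kernel might be launched. Because blockSize must be known at compile time, the host picks an instantiation with a switch; the dispatch pattern follows the document, but the names threads, dimGrid, dimBlock, smemSize, d_idata, and d_odata here are my own assumptions:

switch (threads)
{
    // One explicit instantiation per supported block size; within each
    // instantiation the compile-time blockSize checks are compiled away.
    case 512: reduce6<512><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
    case 256: reduce6<256><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
    case 128: reduce6<128><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
    case  64: reduce6< 64><<<dimGrid, dimBlock, smemSize>>>(d_idata, d_odata, n); break;
}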