CUDA 共享内存错误
CUDA shared memory bug
我写了一个简单的基数排序(每次只按一个二进制位排序,并且只使用一个线程块)。我的第一个版本可以正常工作;之后我尝试先在共享内存中把键排好序,以便对全局内存(DRAM)的写入能够合并(coalesced)。但是第二个版本的结果不正确:输出没有被排好序。
第一个简单的、可以正常工作的版本:
// One-bit split step of a radix sort, written for a single thread block.
// Each thread handles the key at index threadIdx.x. Keys whose bit `desp` is 0
// are scattered to the front of out_keys; keys whose bit is 1 are scattered to
// the positions starting at histo[0].
//
//   in_keys  - input keys, read at index threadIdx.x
//   histo    - histo[0] is used as the start index of the '1'-bit keys
//              (presumably the number of '0'-bit keys; confirm against the caller)
//   desp     - position of the bit to test
//   out_keys - output keys, written at the computed scatter offset
//
// The shared buffer has 1024 entries and is indexed by threadIdx.x, so the
// block must not have more than 1024 threads in x.
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
int tid=threadIdx.x;
// Get offset by using prefix sum scan.
__shared__ unsigned int s_sum[1024];
// first is 1 when the tested bit of this thread's key is 0, otherwise 0.
unsigned int first=((in_keys[tid]>>desp)&1)==0;
s_sum[tid]=first;
__syncthreads();
int pos=tid-1;
// Scan step: add the value `off` slots to the left, doubling `off` each time.
// NOTE(review): the loop condition pos>=0 depends on tid, so threads leave the
// loop after different numbers of iterations and do not all reach the
// __syncthreads() in the body the same number of times. CUDA only defines
// __syncthreads() in conditional code when the condition evaluates identically
// for every thread of the block. There is also no barrier between the write to
// s_sum[tid] and the reads at the top of the next iteration.
for (int off=1; pos>=0; off=off*2, pos=tid-off){
int a=s_sum[pos];
int b=s_sum[tid];
__syncthreads();
s_sum[tid]=a+b;
}
__syncthreads();
// Inclusive sum minus this thread's own flag: number of '0'-bit keys before tid.
int offset=s_sum[tid]-first;
if (first==0){
// Get offset for '1' bit keys: tid-offset is the number of '1'-bit keys before tid.
offset=histo[0]+tid-offset;
}
// Scatter directly to global memory; the write index depends on the key bits.
out_keys[offset]=in_keys[tid];
}
第二个版本:
// One-bit split step of a radix sort, written for a single thread block.
// Same offset computation as a direct scatter, but the keys are first placed
// at their sorted position in shared memory so that the final global write is
// indexed by threadIdx.x.
//
//   in_keys  - input keys, read at index threadIdx.x
//   histo    - histo[0] is used as the start index of the '1'-bit keys
//              (presumably the number of '0'-bit keys; confirm against the caller)
//   desp     - position of the bit to test
//   out_keys - output keys, written at index threadIdx.x
//
// The shared buffer has 1024 entries and is indexed by threadIdx.x and by the
// computed offset, so the block must not have more than 1024 threads in x.
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
int tid=threadIdx.x;
// Get offset by using prefix sum scan.
__shared__ unsigned int s_sum[1024];
// first is 1 when the tested bit of this thread's key is 0, otherwise 0.
unsigned int first=((in_keys[tid]>>desp)&1)==0;
s_sum[tid]=first;
__syncthreads();
int pos=tid-1;
// Scan step: add the value `off` slots to the left, doubling `off` each time.
// NOTE(review): the loop condition pos>=0 depends on tid, so threads leave the
// loop after different numbers of iterations and do not all reach the
// __syncthreads() in the body the same number of times. CUDA only defines
// __syncthreads() in conditional code when the condition evaluates identically
// for every thread of the block. There is also no barrier between the write to
// s_sum[tid] and the reads at the top of the next iteration.
for (int off=1; pos>=0; off=off*2, pos=tid-off){
int a=s_sum[pos];
int b=s_sum[tid];
__syncthreads();
s_sum[tid]=a+b;
}
__syncthreads();
// Inclusive sum minus this thread's own flag: number of '0'-bit keys before tid.
int offset=s_sum[tid]-first;
if (first==0){
// Get offset for '1' bit keys: tid-offset is the number of '1'-bit keys before tid.
offset=histo[0]+tid-offset;
}
// Barrier before s_sum is reused: other threads may still be reading their scan value.
__syncthreads();
// Reuse the scan buffer as staging space: place the key at its sorted position.
s_sum[offset]=in_keys[tid];
__syncthreads();
// Copy the staged keys out with the global index equal to tid.
out_keys[tid]=s_sum[tid];
}
问题在于我在条件代码中调用了 __syncthreads():扫描循环的条件 pos>=0 依赖于线程号,因此各个线程执行循环体的次数并不相同。只有当块内所有线程对该条件的求值结果都相同(即所有线程走相同的执行路径)时,才允许在条件代码中调用 __syncthreads()。
正确版本:
// Stable one-bit split step of a radix sort for a single thread block.
//
// Thread t handles in_keys[t]. Keys whose bit `desp` is 0 keep their relative
// order and fill the front of the output; keys whose bit is 1 keep their
// relative order and follow, starting at index histo[0].
//
// Launch requirements: one block, one key per thread, blockDim.x <= 1024
// (the size of the static shared buffer, 4 KB).
//
//   in_keys  - blockDim.x input keys
//   histo    - histo[0] is the start index of the '1'-bit keys; it is expected
//              to equal the number of keys whose bit `desp` is 0
//   desp     - position of the bit to split on
//   out_keys - blockDim.x output keys, written at index threadIdx.x
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
    __shared__ unsigned int s_sum[1024];
    const unsigned int tid = threadIdx.x;

    // Flag is 1 for a '0'-bit key so that the scan counts the '0'-bit keys.
    const unsigned int v = in_keys[tid];
    const unsigned int first = ((v >> desp) & 1u) == 0u;

    // Inclusive prefix sum of the flags. The loop bound is uniform across the
    // block, so every thread reaches both barriers in every round; only the
    // shared-memory accesses are predicated on the thread index.
    s_sum[tid] = first;
    __syncthreads();
    for (unsigned int off = 1; off < blockDim.x; off <<= 1) {
        const bool active = tid >= off;
        const unsigned int sum = active ? s_sum[tid - off] + s_sum[tid] : 0u;
        __syncthreads();   // all reads of this round are done before any write
        if (active) {
            s_sum[tid] = sum;
        }
        __syncthreads();   // all writes are visible before the next round reads
    }

    // Inclusive sum minus the own flag = number of '0'-bit keys before tid.
    const unsigned int zerosBefore = s_sum[tid] - first;
    // '0'-bit keys go to the front; '1'-bit keys go after histo[0], ordered by
    // the number of '1'-bit keys before them (tid - zerosBefore).
    const unsigned int offset = first ? zerosBefore : histo[0] + tid - zerosBefore;

    // Every thread must have read its scan value before the buffer is reused.
    __syncthreads();
    s_sum[offset] = v;
    __syncthreads();

    // Global write is indexed by tid, so consecutive threads write consecutive words.
    out_keys[tid] = s_sum[tid];
}
我写了一个简单的基数排序(每次只按一个二进制位排序,并且只使用一个线程块)。我的第一个版本可以正常工作;之后我尝试先在共享内存中把键排好序,以便对全局内存(DRAM)的写入能够合并(coalesced)。但是第二个版本的结果不正确:输出没有被排好序。
第一个简单的、可以正常工作的版本:
// One-bit split step of a radix sort, written for a single thread block.
// Each thread handles the key at index threadIdx.x. Keys whose bit `desp` is 0
// are scattered to the front of out_keys; keys whose bit is 1 are scattered to
// the positions starting at histo[0].
//
//   in_keys  - input keys, read at index threadIdx.x
//   histo    - histo[0] is used as the start index of the '1'-bit keys
//              (presumably the number of '0'-bit keys; confirm against the caller)
//   desp     - position of the bit to test
//   out_keys - output keys, written at the computed scatter offset
//
// The shared buffer has 1024 entries and is indexed by threadIdx.x, so the
// block must not have more than 1024 threads in x.
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
int tid=threadIdx.x;
// Get offset by using prefix sum scan.
__shared__ unsigned int s_sum[1024];
// first is 1 when the tested bit of this thread's key is 0, otherwise 0.
unsigned int first=((in_keys[tid]>>desp)&1)==0;
s_sum[tid]=first;
__syncthreads();
int pos=tid-1;
// Scan step: add the value `off` slots to the left, doubling `off` each time.
// NOTE(review): the loop condition pos>=0 depends on tid, so threads leave the
// loop after different numbers of iterations and do not all reach the
// __syncthreads() in the body the same number of times. CUDA only defines
// __syncthreads() in conditional code when the condition evaluates identically
// for every thread of the block. There is also no barrier between the write to
// s_sum[tid] and the reads at the top of the next iteration.
for (int off=1; pos>=0; off=off*2, pos=tid-off){
int a=s_sum[pos];
int b=s_sum[tid];
__syncthreads();
s_sum[tid]=a+b;
}
__syncthreads();
// Inclusive sum minus this thread's own flag: number of '0'-bit keys before tid.
int offset=s_sum[tid]-first;
if (first==0){
// Get offset for '1' bit keys: tid-offset is the number of '1'-bit keys before tid.
offset=histo[0]+tid-offset;
}
// Scatter directly to global memory; the write index depends on the key bits.
out_keys[offset]=in_keys[tid];
}
第二个版本:
// One-bit split step of a radix sort, written for a single thread block.
// Same offset computation as a direct scatter, but the keys are first placed
// at their sorted position in shared memory so that the final global write is
// indexed by threadIdx.x.
//
//   in_keys  - input keys, read at index threadIdx.x
//   histo    - histo[0] is used as the start index of the '1'-bit keys
//              (presumably the number of '0'-bit keys; confirm against the caller)
//   desp     - position of the bit to test
//   out_keys - output keys, written at index threadIdx.x
//
// The shared buffer has 1024 entries and is indexed by threadIdx.x and by the
// computed offset, so the block must not have more than 1024 threads in x.
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
int tid=threadIdx.x;
// Get offset by using prefix sum scan.
__shared__ unsigned int s_sum[1024];
// first is 1 when the tested bit of this thread's key is 0, otherwise 0.
unsigned int first=((in_keys[tid]>>desp)&1)==0;
s_sum[tid]=first;
__syncthreads();
int pos=tid-1;
// Scan step: add the value `off` slots to the left, doubling `off` each time.
// NOTE(review): the loop condition pos>=0 depends on tid, so threads leave the
// loop after different numbers of iterations and do not all reach the
// __syncthreads() in the body the same number of times. CUDA only defines
// __syncthreads() in conditional code when the condition evaluates identically
// for every thread of the block. There is also no barrier between the write to
// s_sum[tid] and the reads at the top of the next iteration.
for (int off=1; pos>=0; off=off*2, pos=tid-off){
int a=s_sum[pos];
int b=s_sum[tid];
__syncthreads();
s_sum[tid]=a+b;
}
__syncthreads();
// Inclusive sum minus this thread's own flag: number of '0'-bit keys before tid.
int offset=s_sum[tid]-first;
if (first==0){
// Get offset for '1' bit keys: tid-offset is the number of '1'-bit keys before tid.
offset=histo[0]+tid-offset;
}
// Barrier before s_sum is reused: other threads may still be reading their scan value.
__syncthreads();
// Reuse the scan buffer as staging space: place the key at its sorted position.
s_sum[offset]=in_keys[tid];
__syncthreads();
// Copy the staged keys out with the global index equal to tid.
out_keys[tid]=s_sum[tid];
}
问题在于我在条件代码中调用了 __syncthreads():扫描循环的条件 pos>=0 依赖于线程号,因此各个线程执行循环体的次数并不相同。只有当块内所有线程对该条件的求值结果都相同(即所有线程走相同的执行路径)时,才允许在条件代码中调用 __syncthreads()。 正确版本:
// Stable one-bit split step of a radix sort for a single thread block.
//
// Thread t handles in_keys[t]. Keys whose bit `desp` is 0 keep their relative
// order and fill the front of the output; keys whose bit is 1 keep their
// relative order and follow, starting at index histo[0].
//
// Launch requirements: one block, one key per thread, blockDim.x <= 1024
// (the size of the static shared buffer, 4 KB).
//
//   in_keys  - blockDim.x input keys
//   histo    - histo[0] is the start index of the '1'-bit keys; it is expected
//              to equal the number of keys whose bit `desp` is 0
//   desp     - position of the bit to split on
//   out_keys - blockDim.x output keys, written at index threadIdx.x
__global__ void dev_radix(unsigned int *in_keys, const unsigned int *histo, unsigned int desp, unsigned int *out_keys){
    __shared__ unsigned int s_sum[1024];
    const unsigned int tid = threadIdx.x;

    // Flag is 1 for a '0'-bit key so that the scan counts the '0'-bit keys.
    const unsigned int v = in_keys[tid];
    const unsigned int first = ((v >> desp) & 1u) == 0u;

    // Inclusive prefix sum of the flags. The loop bound is uniform across the
    // block, so every thread reaches both barriers in every round; only the
    // shared-memory accesses are predicated on the thread index.
    s_sum[tid] = first;
    __syncthreads();
    for (unsigned int off = 1; off < blockDim.x; off <<= 1) {
        const bool active = tid >= off;
        const unsigned int sum = active ? s_sum[tid - off] + s_sum[tid] : 0u;
        __syncthreads();   // all reads of this round are done before any write
        if (active) {
            s_sum[tid] = sum;
        }
        __syncthreads();   // all writes are visible before the next round reads
    }

    // Inclusive sum minus the own flag = number of '0'-bit keys before tid.
    const unsigned int zerosBefore = s_sum[tid] - first;
    // '0'-bit keys go to the front; '1'-bit keys go after histo[0], ordered by
    // the number of '1'-bit keys before them (tid - zerosBefore).
    const unsigned int offset = first ? zerosBefore : histo[0] + tid - zerosBefore;

    // Every thread must have read its scan value before the buffer is reused.
    __syncthreads();
    s_sum[offset] = v;
    __syncthreads();

    // Global write is indexed by tid, so consecutive threads write consecutive words.
    out_keys[tid] = s_sum[tid];
}