Unspecified launch failure after cudaDeviceSynchronize() call when the program runs, but no errors when step-through debugging (CUDA)

I have spent several hours fighting an unspecified launch failure. To understand how shared memory works, I came up with a small task for myself.

The task is to split the array [1, 2, 3, ..., N] into K groups of (N / K) elements each and find the sum of each group. (The difference between the current array element and the previous one equals 1.)

I planned to use N threads in a grid divided into K blocks, so each thread block contains (N / K) threads. One thread block can therefore be used to compute the sum of one group. I also wanted to allocate the shared memory dynamically.

When I run the program, I get an unspecified launch failure after the cudaDeviceSynchronize() call. But when I step through it in the debugger, everything works fine.

What am I doing wrong? (Visual Studio 2012 Professional, Compute Capability 2.1.) Any help is much appreciated.

#include <stdio.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    printf("%s\n",cudaGetErrorString(x)); \
    system("pause"); \
    return EXIT_FAILURE;}} while(0)

extern __shared__ double shrd[]; //dynamically sized shared memory; size is set at kernel launch

__global__ void kernel(double * a){
    size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;                     //thread id within the block
    size_t blockID_global = gridDim.x * blockIdx.y + blockIdx.x;                        //block id within the grid
    size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block; //global thread id
    double * temp = &shrd[blockID_global * blockDim.x * blockDim.y];                    //per-block slice of shared memory, offset by global block id
    temp[threadID_block] = static_cast<double>(threadID_global);

    __syncthreads();
    if (threadID_block == 0){
        a[blockID_global] = 0.0;
        for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
            a[blockID_global] += temp[index];
        }
    }
}

int main(){

    int devNum = 0;
    CUDA_CALL(cudaGetDevice(&devNum));
    CUDA_CALL(cudaSetDevice(devNum));


    dim3 gridSize(2,2,1);  //4 blocks
    dim3 blockSize(4,4,1); //16 threads per block

    double * dev_a = NULL;
    size_t length = gridSize.x * gridSize.y;
    size_t byteSize = length * sizeof(double);
    CUDA_CALL(cudaMalloc(&dev_a,byteSize));

    size_t shmem_perBlock = blockSize.x * blockSize.y * sizeof(double);
    kernel <<< gridSize, blockSize,  shmem_perBlock >>> (dev_a);
    CUDA_CALL(cudaGetLastError());
    CUDA_CALL(cudaDeviceSynchronize());

    double * a = new double [length];
    CUDA_CALL(cudaMemcpy(a,dev_a,byteSize,cudaMemcpyDeviceToHost));

    for (size_t index = 0; index < length; index++){
        printf("%.3f\n",a[index]);
    }

    printf("\n");

    CUDA_CALL(cudaFree(dev_a));
    CUDA_CALL(cudaDeviceReset());
    delete[] a;

    system("pause");
    return 0;
}

If you are on Kepler or newer hardware, read this first: http://devblogs.nvidia.com/parallelforall/faster-parallel-reductions-kepler/

Otherwise, if you are on pre-Kepler hardware, read this: http://developer.download.nvidia.com/compute/cuda/1.1-Beta/x86_website/projects/reduction/doc/reduction.pdf

You are missing some fundamentals of CUDA programming, so I have written you a code template below to clarify some of them. The immediate bug in your kernel is the shared-memory indexing: shared memory is private to each block, and each block sees its own shrd array of blockDim.x * blockDim.y elements starting at offset 0. By offsetting with blockID_global * blockDim.x * blockDim.y, every block except block 0 writes past the end of its shared allocation, which is what produces the unspecified launch failure (the behavior of such out-of-bounds accesses is undefined, so the step-through debugger will not necessarily reproduce the crash). Do not expect the template to be optimized, because I expect you to program the parallel reduction yourself; it will get you started on how to use shared memory.
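
For reference, here is a minimal sketch of your original kernel with only that indexing fixed (everything else is your code unchanged): each block uses its own shared array starting at offset 0.

__global__ void kernel(double * a){
    size_t threadID_block = blockDim.x * threadIdx.y + threadIdx.x;
    size_t blockID_global = gridDim.x * blockIdx.y + blockIdx.x;
    size_t threadID_global = blockID_global * blockDim.x * blockDim.y + threadID_block;

    //shrd belongs to this block alone, so index it from offset 0 with the
    //block-local thread id instead of applying a global block offset
    double * temp = shrd;
    temp[threadID_block] = static_cast<double>(threadID_global);

    __syncthreads();
    if (threadID_block == 0){
        a[blockID_global] = 0.0;
        for (size_t index = 0; index < blockDim.x * blockDim.y; index++){
            a[blockID_global] += temp[index];
        }
    }
}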

Good luck!

#include <stdio.h>

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#define N 10000
#define K 100

#define CUDA_CALL(x) do { if((x) != cudaSuccess) { \
    printf("Error at %s:%d\n",__FILE__,__LINE__); \
    printf("%s\n",cudaGetErrorString(x)); \
    system("pause"); \
    return EXIT_FAILURE;}} while(0)

__global__ void kernel(double* a, double* results){

    extern __shared__ double shared[];

    size_t tid, tid_local, stride;
    tid       = blockDim.x*blockIdx.x+threadIdx.x; //global thread id across all blocks
    tid_local = threadIdx.x;                       //thread id within this block
    stride    = blockDim.x*gridDim.x;              //total number of threads (handy for the reduction)

    double *start = &a[K*blockIdx.x]; //each block gets its own K-element chunk of a

    shared[tid_local] = start[tid_local]; //copy this block's K elements into shared memory
    __syncthreads();

    //Perform the parallel reduction here; you will have to implement it
    //yourself. After the reduction, the block's sum should be in shared[0].

    //For demonstration only, thread 0 of each block sums its chunk serially.
    double sum = 0;
    if(tid_local == 0){
        for(int i = 0; i < K; i++){
            sum += shared[i];
        }

        results[blockIdx.x] = sum; //store the block's sum in the results array
    }

}

int main(){

    int devNum = 0;
    CUDA_CALL(cudaGetDevice(&devNum));
    CUDA_CALL(cudaSetDevice(devNum));


    double * dev_a = NULL;
    double * dev_results=NULL;

    CUDA_CALL(cudaMalloc(&dev_a, N*sizeof(double) ));
    CUDA_CALL(cudaMalloc(&dev_results, (N/K)*sizeof(double)));

    //create the array [1, 2, 3, ..., N] on the host and copy it onto the GPU
    //(this is the array you are summing)
    double * a = new double [N];
    for (size_t i = 0; i < N; i++){
        a[i] = static_cast<double>(i + 1);
    }
    CUDA_CALL(cudaMemcpy(dev_a, a, N*sizeof(double), cudaMemcpyHostToDevice));

    dim3 block_size(K,   1, 1);
    dim3 grid_size (N/K, 1, 1);

    size_t shmem_perBlock = K * sizeof(double);

    kernel <<< grid_size, block_size,  shmem_perBlock >>> (dev_a, dev_results);

    CUDA_CALL(cudaGetLastError());
    CUDA_CALL(cudaDeviceSynchronize());

    //copy dev_results back to the CPU; this is your result
    double * results = new double [N/K];
    CUDA_CALL(cudaMemcpy(results, dev_results, (N/K)*sizeof(double), cudaMemcpyDeviceToHost));

    CUDA_CALL(cudaFree(dev_a));
    CUDA_CALL(cudaFree(dev_results));
    delete[] a;
    delete[] results;

    system("pause");
    return 0;
}
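
When you get to the reduction itself, here is a minimal sketch of a shared-memory tree reduction that could replace the serial loop in the kernel above. It uses the template's names (shared, tid_local, results) and is only a sketch, not one of the tuned versions from the links; the half is rounded up so that a non-power-of-two block size such as K = 100 still sums correctly.

    //Tree reduction in shared memory: halve the active range each step.
    //Rounding the half up lets a non-power-of-two block size (e.g. K = 100) work.
    unsigned int n = blockDim.x;
    while (n > 1){
        unsigned int half = (n + 1) / 2;
        if (tid_local < n / 2){
            shared[tid_local] += shared[tid_local + half];
        }
        __syncthreads(); //n is uniform across the block, so every thread reaches this
        n = half;
    }

    if (tid_local == 0){
        results[blockIdx.x] = shared[0]; //the block's sum ends up in shared[0]
    }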