CUDA not waiting for kernel to finish

So I ran into an illegal memory access error when I tried to launch some kernels in a loop like this:

for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);

        // Compute nOnesBefore       
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);

        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

Then I debugged and found that the 'extractBits' kernel was computing incorrectly. When I tried printing to see the problem, the error rarely occurred. So I realized that the more delay I added, the less likely the error was to occur. I added something like this, and it ran without errors:

for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);

        // Dummy copy to delay, do nothing.
        CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

        // Compute nOnesBefore       
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);

        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

Here is the source code (it only runs correctly on Colab with a GPU like the T4; it does not work on the P100):

#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#define CHECK(call)\
{\
    const cudaError_t error = call;\
    if (error != cudaSuccess)\
    {\
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);\
        fprintf(stderr, "code: %d, reason: %s\n", error,\
                cudaGetErrorString(error));\
        exit(1);\
    }\
}

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);
        cudaEventSynchronize(start);
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

// Sequential Radix Sort
// "const uint32_t * in" means: the memory region pointed by "in" is read-only
void sortByHost(const uint32_t * in, int n,
                uint32_t * out)
{
    int * bits = (int *)malloc(n * sizeof(int));
    int * nOnesBefore = (int *)malloc(n * sizeof(int));

    uint32_t * src = (uint32_t *)malloc(n * sizeof(uint32_t));
    uint32_t * originalSrc = src; // To free memory later
    memcpy(src, in, n * sizeof(uint32_t));
    uint32_t * dst = out;

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst 
    // (using STABLE counting sort)
    for (int bitIdx = 0; bitIdx < sizeof(uint32_t) * 8; bitIdx++)
    {
        // Extract bits
        for (int i = 0; i < n; i++)
            bits[i] = (src[i] >> bitIdx) & 1;

        // Compute nOnesBefore
        nOnesBefore[0] = 0;
        for (int i = 1; i < n; i++)
            nOnesBefore[i] = nOnesBefore[i-1] + bits[i-1];

        // Compute rank and write to dst
        int nZeros = n - nOnesBefore[n-1] - bits[n-1];
        for (int i = 0; i < n; i++)
        {
            int rank;
            if (bits[i] == 0)
                rank = i - nOnesBefore[i];
            else
                rank = nZeros + nOnesBefore[i];
            dst[rank] = src[i];
        }

        // Swap src and dst
        uint32_t * temp = src;
        src = dst;
        dst = temp;
    }

    // Does out array contain results?
    memcpy(out, src, n * sizeof(uint32_t));

    // Free memory
    free(originalSrc);
    free(bits);
    free(nOnesBefore);
}

__global__ void extractBits(uint32_t* in, int n, int* out, int bitId)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = (in[i] >> bitId) & 1;
}

__device__ int bCount = 0;
volatile __device__ int bDoneCount = 0;

__global__ void scanKernel(int* in, int n, int* out, volatile int* bSums)
{
    __shared__ int blockId;
    if (threadIdx.x == 0)
    {
        blockId = atomicAdd(&bCount, 1);
    }
    __syncthreads();

    // 1. Each block loads data from GMEM to SMEM
    extern __shared__ int s_data[];

    int i = blockId * blockDim.x + threadIdx.x;

    if (i < n)
    {
        if (i == 0)
            s_data[threadIdx.x] = 0;
        else
            s_data[threadIdx.x] = in[i - 1];
        __syncthreads();

        // 2. Each block does scan with data on SMEM
        for (int stride = 1; stride < blockDim.x; stride *= 2)
        {
            int neededVal;
            if (threadIdx.x >= stride)
                neededVal = s_data[threadIdx.x - stride];
            __syncthreads();
            if (threadIdx.x >= stride)
                s_data[threadIdx.x] += neededVal;
            __syncthreads();
        }

        // 3. Each block write results from SMEM to GMEM
        out[i] = s_data[threadIdx.x];

        if (bSums != NULL)
        {
            if (threadIdx.x == 0)
            {
                bSums[blockId] = s_data[blockDim.x - 1];

                if (blockId > 0)
                {
                    while (bDoneCount < blockId) {}
                    bSums[blockId] += bSums[blockId - 1];
                    __threadfence();
                }
                bDoneCount += 1;
            }
            __syncthreads();

            if (i + blockDim.x < n)
                out[i + blockDim.x] += bSums[blockId];
        }
    }
}

__global__ void computeRank(uint32_t* in, int n, uint32_t* out, int* bits, int* nOnesBefore)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int nZeros = n - nOnesBefore[n - 1] - bits[n - 1];

    if (i < n)
    {
        int rank;
        if (bits[i] == 0)
            rank = i - nOnesBefore[i];
        else
            rank = nZeros + nOnesBefore[i];
        out[rank] = in[i];
    }
}

// Parallel Radix Sort
void sortByDevice(const uint32_t * in, int n, uint32_t * out, int blockSize)
{
    uint32_t* src = (uint32_t*)malloc(n * sizeof(uint32_t));
    // TODO
    int *d_bits, *d_nOnesBefore;
    size_t nBytes = n * sizeof(int);
    CHECK(cudaMalloc(&d_bits, nBytes));
    CHECK(cudaMalloc(&d_nOnesBefore, nBytes));

    uint32_t * d_in, * d_out;
    nBytes = n * sizeof(uint32_t);
    CHECK(cudaMalloc(&d_in, nBytes));
    CHECK(cudaMalloc(&d_out, nBytes));

    CHECK(cudaMemcpy(d_in, in, nBytes, cudaMemcpyHostToDevice));

    int gridSize = (n - 1) / blockSize + 1;

    int* d_bSums;
    if (gridSize > 1)
    {
        CHECK(cudaMalloc(&d_bSums, gridSize * sizeof(int)));
    }
    else
    {
        d_bSums = NULL;
    }
    size_t smem = blockSize * sizeof(int);

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst 
    // (using STABLE counting sort)
    for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

        // Compute nOnesBefore       
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank and write to d_out
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());
        
        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

    CHECK(cudaMemcpy(out, d_in, nBytes, cudaMemcpyDeviceToHost));
    

    // Free memory  
    CHECK(cudaFree(d_bits));
    CHECK(cudaFree(d_nOnesBefore));
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));

    if (gridSize > 1)
        CHECK(cudaFree(d_bSums));

    free(src);
}

// Radix Sort
void sort(const uint32_t * in, int n, 
        uint32_t * out, 
        bool useDevice=false, int blockSize=1)
{
    GpuTimer timer; 
    timer.Start();

    if (useDevice == false)
    {
        printf("\nRadix Sort by host\n");
        sortByHost(in, n, out);
    }
    else // use device
    {
        printf("\nRadix Sort by device\n");
        sortByDevice(in, n, out, blockSize);
    }

    timer.Stop();
    printf("Time: %.3f ms\n", timer.Elapsed());
}

void printDeviceInfo()
{
    cudaDeviceProp devProv;
    CHECK(cudaGetDeviceProperties(&devProv, 0));
    printf("**********GPU info**********\n");
    printf("Name: %s\n", devProv.name);
    printf("Compute capability: %d.%d\n", devProv.major, devProv.minor);
    printf("Num SMs: %d\n", devProv.multiProcessorCount);
    printf("Max num threads per SM: %d\n", devProv.maxThreadsPerMultiProcessor); 
    printf("Max num warps per SM: %d\n", devProv.maxThreadsPerMultiProcessor / devProv.warpSize);
    printf("GMEM: %zu byte\n", devProv.totalGlobalMem);
    printf("SMEM per SM: %zu byte\n", devProv.sharedMemPerMultiprocessor);
    printf("SMEM per block: %zu byte\n", devProv.sharedMemPerBlock);
    printf("****************************\n");
}

void checkCorrectness(uint32_t * out, uint32_t * correctOut, int n)
{
    for (int i = 0; i < n; i++)
    {
        if (out[i] != correctOut[i])
        {
            printf("INCORRECT :(\n");
            return;
        }
    }
    printf("CORRECT :)\n");
}

void printArray(uint32_t * a, int n)
{
    for (int i = 0; i < n; i++)
        printf("%i ", a[i]);
    printf("\n");
}

int main(int argc, char ** argv)
{
    // PRINT OUT DEVICE INFO
    printDeviceInfo();

    // SET UP INPUT SIZE
    //int n = 50; // For test by eye
    int n = (1 << 24) + 1;
    printf("\nInput size: %d\n", n);

    // ALLOCATE MEMORIES
    size_t bytes = n * sizeof(uint32_t);
    uint32_t * in = (uint32_t *)malloc(bytes);
    uint32_t * out = (uint32_t *)malloc(bytes); // Device result
    uint32_t * correctOut = (uint32_t *)malloc(bytes); // Host result

    // SET UP INPUT DATA
    for (int i = 0; i < n; i++)
    {
        //in[i] = rand() % 255; // For test by eye
        in[i] = rand();
    }
    //printArray(in, n); // For test by eye

    // DETERMINE BLOCK SIZE
    int blockSize = 256; // Default 
    if (argc == 2)
        blockSize = atoi(argv[1]);

    // SORT BY HOST
    sort(in, n, correctOut);
    //printArray(correctOut, n); // For test by eye
    
    // SORT BY DEVICE
    sort(in, n, out, true, blockSize);
    //printArray(out, n); // For test by eye
    checkCorrectness(out, correctOut, n);

    // FREE MEMORIES
    free(in);
    free(out);
    free(correctOut);
    
    return EXIT_SUCCESS;
}

So what is wrong with my code? Could someone explain it to me?

There are potentially multiple problems with this code. I'll list 3, then give some additional comments.

  1. You have an illegal use of __syncthreads():

     if (i < n)   <--------------------------------------
     {                                                   |
         if (i == 0)                                     |
             s_data[threadIdx.x] = 0;                    |
         else                                            |
             s_data[threadIdx.x] = in[i - 1];            |
         __syncthreads();  <-----------------------------
    

    See here. However, I don't think this is the core issue.

  2. I have reason to believe that, for correctness, you need a __threadfence() here:

         if (threadIdx.x == 0)
         {
             bSums[blockId] = s_data[blockDim.x - 1];
             __threadfence();  // added
             if (blockId > 0)
    

    to handle the interaction between block 0 and block 1. Without the fence, block 1 could observe the incremented bDoneCount while block 0's write to bSums[0] is not yet visible, and would then add a stale partial sum.

  3. According to my testing, the core issue is that you have not properly accounted for a global memory race condition. Let's consider this code:

             // 3. Each block write results from SMEM to GMEM
             out[i] = s_data[threadIdx.x];  // line A
    
             if (bSums != NULL)
             {
                 if (threadIdx.x == 0)
                 {
                     bSums[blockId] = s_data[blockDim.x - 1];
    
                     if (blockId > 0)
                     {
                         while (bDoneCount < blockId) {}
                         bSums[blockId] += bSums[blockId - 1];
                         __threadfence();
                     }
                     bDoneCount += 1;
                 }
                 __syncthreads();
    
                 if (i + blockDim.x < n)
                     out[i + blockDim.x] += bSums[blockId];  // line B
             }
         }
     }
    

    I have marked line A and line B in it. For correctness, this requires that, for any given block X, block X+1 must execute line A before block X executes line B: block X's line B does a read-modify-write (+=) on the very out[] elements that block X+1's line A writes, so if block X runs ahead, its update is applied to stale data and then overwritten. I see nothing that enforces that ordering.

I wrote the test code below primarily to test the assertion in item 3 above. The key addition is to require that no block proceeds to line B until the next-higher-numbered block has passed line A. Since you already have a bDoneCount counter that can be used for this purpose, I repurposed it for this crude block-to-block synchronization. A few caveats:

  • I do not recommend this style of programming at all.
  • I am not suggesting that anything I present here is correct. It is mostly your code; my goal is to demonstrate the assertion in item 3 above.
  • Yes, the extra block-to-block synchronization I added makes your overall code run much slower. As far as I'm concerned that doesn't matter, because none of the work here is a sensible way to write a high-performance radix sort.
  • If you care about performance, you should not write your own scan kernel, at least not the one you have here.
  • If you care about performance, you should use a library implementation, either of the sort itself or at least of the prefix sum. The prefix sum here is not a high-performance implementation (a library-based sketch follows this list).
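
To illustrate that last point, here is a minimal sketch of the library route, assuming Thrust (which ships with the CUDA toolkit); for uint32_t keys, thrust::sort dispatches to a tuned radix sort. The wrapper name sortByDeviceThrust is just for illustration:

    // Minimal sketch, assuming Thrust (bundled with the CUDA toolkit).
    // For uint32_t keys, thrust::sort dispatches to a tuned radix sort,
    // replacing the entire extractBits/scanKernel/computeRank loop.
    #include <thrust/sort.h>
    #include <thrust/execution_policy.h>

    void sortByDeviceThrust(const uint32_t* in, int n, uint32_t* out) // hypothetical wrapper
    {
        uint32_t* d_data;
        CHECK(cudaMalloc(&d_data, n * sizeof(uint32_t)));
        CHECK(cudaMemcpy(d_data, in, n * sizeof(uint32_t), cudaMemcpyHostToDevice));

        thrust::sort(thrust::device, d_data, d_data + n); // ascending sort on the device

        CHECK(cudaMemcpy(out, d_data, n * sizeof(uint32_t), cudaMemcpyDeviceToHost));
        CHECK(cudaFree(d_data));
    }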

With those caveats in mind, here is a test-code modification of your code that addresses some of the issues in the 3 items I listed. It still does not fix the improper use of __syncthreads(); however, that is a fairly mechanical fix once you understand the underlying issue and requirement (a sketch follows the listing below). The code below passed whatever tests I cared to run on it. You will find other modifications beyond what I have described, mostly for the convenience of my testing and for issue visibility:

$ cat t109.cu
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <stdlib.h>

#define CHECK(call)\
{\
    const cudaError_t error = call;\
    if (error != cudaSuccess)\
    {\
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);\
        fprintf(stderr, "code: %d, reason: %s\n", error,\
                cudaGetErrorString(error));\
        exit(1);\
    }\
}

struct GpuTimer
{
    cudaEvent_t start;
    cudaEvent_t stop;

    GpuTimer()
    {
        cudaEventCreate(&start);
        cudaEventCreate(&stop);
    }

    ~GpuTimer()
    {
        cudaEventDestroy(start);
        cudaEventDestroy(stop);
    }

    void Start()
    {
        cudaEventRecord(start, 0);
        cudaEventSynchronize(start);
    }

    void Stop()
    {
        cudaEventRecord(stop, 0);
    }

    float Elapsed()
    {
        float elapsed;
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&elapsed, start, stop);
        return elapsed;
    }
};

// Sequential Radix Sort
// "const uint32_t * in" means: the memory region pointed by "in" is read-only
void sortByHost(const uint32_t * in, int n,
                uint32_t * out)
{
    int * bits = (int *)malloc(n * sizeof(int));
    int * nOnesBefore = (int *)malloc(n * sizeof(int));

    uint32_t * src = (uint32_t *)malloc(n * sizeof(uint32_t));
    uint32_t * originalSrc = src; // To free memory later
    memcpy(src, in, n * sizeof(uint32_t));
    uint32_t * dst = out;

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst
    // (using STABLE counting sort)
    for (int bitIdx = 0; bitIdx < sizeof(uint32_t) * 8; bitIdx++)
    {
        // Extract bits
        for (int i = 0; i < n; i++)
            bits[i] = (src[i] >> bitIdx) & 1;

        // Compute nOnesBefore
        nOnesBefore[0] = 0;
        for (int i = 1; i < n; i++)
            nOnesBefore[i] = nOnesBefore[i-1] + bits[i-1];

        // Compute rank and write to dst
        int nZeros = n - nOnesBefore[n-1] - bits[n-1];
        for (int i = 0; i < n; i++)
        {
            int rank;
            if (bits[i] == 0)
                rank = i - nOnesBefore[i];
            else
                rank = nZeros + nOnesBefore[i];
            dst[rank] = src[i];
        }

        // Swap src and dst
        uint32_t * temp = src;
        src = dst;
        dst = temp;
    }

    // Does out array contain results?
    memcpy(out, src, n * sizeof(uint32_t));

    // Free memory
    free(originalSrc);
    free(bits);
    free(nOnesBefore);
}

__global__ void extractBits(uint32_t* in, int n, int* out, int bitId)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = (in[i] >> bitId) & 1;
}

__device__ int bCount = 0;
volatile __device__ int bDoneCount = 0;

__global__ void scanKernel(int* in, int n, int* out, volatile int* bSums)
{
    __shared__ int blockId;
    if (threadIdx.x == 0)
    {
        blockId = atomicAdd(&bCount, 1);
    }
    __syncthreads();

    // 1. Each block loads data from GMEM to SMEM
    extern __shared__ int s_data[];

    int i = blockId * blockDim.x + threadIdx.x;

    if (i < n)
    {
        if (i == 0)
            s_data[threadIdx.x] = 0;
        else
            s_data[threadIdx.x] = in[i - 1];
        __syncthreads();

        // 2. Each block does scan with data on SMEM
        for (int stride = 1; stride < blockDim.x; stride *= 2)
        {
            int neededVal;
            if (threadIdx.x >= stride)
                neededVal = s_data[threadIdx.x - stride];
            __syncthreads();
            if (threadIdx.x >= stride)
                s_data[threadIdx.x] += neededVal;
            __syncthreads();
        }

        // 3. Each block write results from SMEM to GMEM
        out[i] = s_data[threadIdx.x];

        if (bSums != NULL)
        {
            if (threadIdx.x == 0)
            {
                bSums[blockId] = s_data[blockDim.x - 1];
                __threadfence();

                if (blockId > 0)
                {
                    while (bDoneCount < blockId) {}
                    bSums[blockId] += bSums[blockId - 1];
                    __threadfence();
                }
                bDoneCount += 1;
            }
            if (blockId < (gridDim.x-1)) {while (bDoneCount < (blockId+2)){};} // ADDED SYNC
            __syncthreads();

            if (i + blockDim.x < n)
                out[i + blockDim.x] += bSums[blockId];
        }
    }
}
__global__ void computeRank(uint32_t* in, int n, uint32_t* out, int* bits, int* nOnesBefore)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    int nZeros = n - nOnesBefore[n - 1] - bits[n - 1];

    if (i < n)
    {
        int rank;
        if (bits[i] == 0)
            rank = i - nOnesBefore[i];
        else
            rank = nZeros + nOnesBefore[i];
        out[rank] = in[i];
    }
}

// Parallel Radix Sort
void sortByDevice(const uint32_t * in, int n, uint32_t * out, int blockSize)
{
    uint32_t* src = (uint32_t*)malloc(n * sizeof(uint32_t));
    // TODO
    int *d_bits, *d_nOnesBefore;
    size_t nBytes = n * sizeof(int);
    CHECK(cudaMalloc(&d_bits, nBytes));
    CHECK(cudaMalloc(&d_nOnesBefore, nBytes));

    uint32_t * d_in, * d_out;
    nBytes = n * sizeof(uint32_t);
    CHECK(cudaMalloc(&d_in, nBytes));
    CHECK(cudaMalloc(&d_out, nBytes));

    CHECK(cudaMemcpy(d_in, in, nBytes, cudaMemcpyHostToDevice));

//    int gridSize = (n - 1) / blockSize + 1;
    int gridSize = (n+blockSize-1)/blockSize;
    int* d_bSums;
    if (gridSize > 1)
    {
        CHECK(cudaMalloc(&d_bSums, gridSize * sizeof(int)));
    }
    else
    {
        d_bSums = NULL;
    }
    size_t smem = blockSize * sizeof(int);

    // Loop from LSB (Least Significant Bit) to MSB (Most Significant Bit)
    // In each loop, sort elements according to the current bit from src to dst
    // (using STABLE counting sort)
    for (int bitId = 0; bitId < sizeof(uint32_t) * 8; bitId++)
    {
        // Extract bits
        extractBits <<< gridSize, blockSize >>> (d_in, n, d_bits, bitId);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

    //    CHECK(cudaMemcpy(src, d_in, sizeof(uint32_t) * n, cudaMemcpyDeviceToHost));

        // Compute nOnesBefore
        scanKernel <<< gridSize, blockSize, smem >>> (d_bits, n, d_nOnesBefore, d_bSums);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        int zero = 0;
        CHECK(cudaMemcpyToSymbol(bCount, &zero, sizeof(int)));
        CHECK(cudaMemcpyToSymbol(bDoneCount, &zero, sizeof(int)));

        // Compute rank and write to d_out
        computeRank <<< gridSize, blockSize >>> (d_in, n, d_out, d_bits, d_nOnesBefore);
        cudaDeviceSynchronize();
        CHECK(cudaGetLastError());

        // Swap d_in and d_out
        uint32_t* temp = d_in;
        d_in = d_out;
        d_out = temp;
    }

    CHECK(cudaMemcpy(out, d_in, nBytes, cudaMemcpyDeviceToHost));


    // Free memory
    CHECK(cudaFree(d_bits));
    CHECK(cudaFree(d_nOnesBefore));
    CHECK(cudaFree(d_in));
    CHECK(cudaFree(d_out));

    if (gridSize > 1)
        CHECK(cudaFree(d_bSums));

    free(src);
}

// Radix Sort
void sort(const uint32_t * in, int n,
        uint32_t * out,
        bool useDevice=false, int blockSize=1)
{
    GpuTimer timer;
    timer.Start();

    if (useDevice == false)
    {
        printf("\nRadix Sort by host\n");
        sortByHost(in, n, out);
    }
    else // use device
    {
        printf("\nRadix Sort by device\n");
        sortByDevice(in, n, out, blockSize);
    }

    timer.Stop();
    printf("Time: %.3f ms\n", timer.Elapsed());
}

void printDeviceInfo()
{
    cudaDeviceProp devProv;
    CHECK(cudaGetDeviceProperties(&devProv, 0));
    printf("**********GPU info**********\n");
    printf("Name: %s\n", devProv.name);
    printf("Compute capability: %d.%d\n", devProv.major, devProv.minor);
    printf("Num SMs: %d\n", devProv.multiProcessorCount);
    printf("Max num threads per SM: %d\n", devProv.maxThreadsPerMultiProcessor);
    printf("Max num warps per SM: %d\n", devProv.maxThreadsPerMultiProcessor / devProv.warpSize);
    printf("GMEM: %zu byte\n", devProv.totalGlobalMem);
    printf("SMEM per SM: %zu byte\n", devProv.sharedMemPerMultiprocessor);
    printf("SMEM per block: %zu byte\n", devProv.sharedMemPerBlock);
    printf("****************************\n");
}

void checkCorrectness(uint32_t * out, uint32_t * correctOut, int n)
{
    for (int i = 0; i < n; i++)
    {
        if (out[i] != correctOut[i])
        {
            printf("INCORRECT : index: %d, was: %u, should be: %u\n", i, out[i], correctOut[i]);
            return;
        }
    }
    printf("CORRECT :)\n");
}

void printArray(uint32_t * a, int n)
{
    for (int i = 0; i < n; i++)
        printf("%i ", a[i]);
    printf("\n");
}
bool is_sorted(uint32_t *data, int n){
        for (int i = 1; i < n; i++)
          if (data[i-1] > data[i]) return false;
        return true;
}
int main(int argc, char ** argv)
{
    // PRINT OUT DEVICE INFO
    printDeviceInfo();

    // SET UP INPUT SIZE
    //int n = 50; // For test by eye
    int n = (1 << 24)+ 1;
    if (argc > 1)  n = (1 << atoi(argv[1]));
    printf("\nInput size: %d\n", n);

    // ALLOCATE MEMORIES
    size_t bytes = n * sizeof(uint32_t);
    uint32_t * in = (uint32_t *)malloc(bytes);
    uint32_t * out = (uint32_t *)malloc(bytes); // Device result
    uint32_t * correctOut = (uint32_t *)malloc(bytes); // Host result

    // SET UP INPUT DATA
    for (int i = 0; i < n; i++)
    {
        //in[i] = rand() % 255; // For test by eye
        in[i] = rand()%n;
    }
    //printArray(in, n); // For test by eye

    // DETERMINE BLOCK SIZE
    int blockSize = 256; // Default
#if 0
    if (argc == 2)
        blockSize = atoi(argv[1]);
#endif
    // SORT BY HOST
    sort(in, n, correctOut);
    //printArray(correctOut, n); // For test by eye
    if (!is_sorted(correctOut, n)) printf("host sorting error\n");
    // SORT BY DEVICE
    sort(in, n, out, true, blockSize);
    if (!is_sorted(out, n)) printf("device sorting error\n");
    //printArray(out, n); // For test by eye
    checkCorrectness(out, correctOut, n);

    // FREE MEMORIES
    free(in);
    free(out);
    free(correctOut);

    return EXIT_SUCCESS;
}
$ nvcc -o t109 t109.cu
$ ./t109
**********GPU info**********
Name: GeForce GTX 960
Compute capability: 5.2
Num SMs: 8
Max num threads per SM: 2048
Max num warps per SM: 64
GMEM: 2099052544 byte
SMEM per SM: 98304 byte
SMEM per block: 49152 byte
****************************

Input size: 16777217

Radix Sort by host
Time: 9495.082 ms

Radix Sort by device
Time: 11949.311 ms
CORRECT :)
$

(CUDA 11.1, Fedora 29, GTX960)
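
As mentioned, the remaining improper use of __syncthreads() (item 1) has a mechanical fix: keep every barrier on a path that all threads of the block execute, and guard only the memory accesses. A minimal sketch of the load-and-scan portion rewritten under that rule (using blockIdx.x for brevity; the atomicAdd-derived blockId would be handled the same way):

    // Sketch only: guarded accesses, unconditional barriers.
    // Every thread in the block reaches each __syncthreads();
    // only in-range threads touch memory.
    __global__ void scanKernelSketch(const int* in, int n, int* out)
    {
        extern __shared__ int s_data[];
        int i = blockIdx.x * blockDim.x + threadIdx.x;

        if (i < n)                        // guard only the access...
            s_data[threadIdx.x] = (i == 0) ? 0 : in[i - 1];
        __syncthreads();                  // ...every thread hits the barrier

        for (int stride = 1; stride < blockDim.x; stride *= 2)
        {
            int neededVal = 0;
            if (threadIdx.x >= stride && i < n)
                neededVal = s_data[threadIdx.x - stride];
            __syncthreads();              // unconditional
            if (threadIdx.x >= stride && i < n)
                s_data[threadIdx.x] += neededVal;
            __syncthreads();              // unconditional
        }

        if (i < n)
            out[i] = s_data[threadIdx.x];
        // the bSums bookkeeping would be restructured the same way
    }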

You may still have problems. A few suggestions:

  1. Make sure you are not running on a machine with a kernel timeout in effect. I don't think any of these kernels should run for 2 seconds or more, but see item 2 below. You can use the deviceQuery sample code to determine whether a kernel timeout is in effect (or see the sketch after this list).
  2. Make sure, if on Windows, that you are compiling a release project, not a debug project. Compiling a debug project (or compiling with -G, whether on Windows or Linux) makes your kernels run more slowly.
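
For item 1, an alternative to building deviceQuery is to read the same property directly; kernelExecTimeoutEnabled is a standard field of cudaDeviceProp and corresponds to the "Run time limit on kernels" line that deviceQuery prints:

    // Minimal check for a kernel run time limit on device 0.
    cudaDeviceProp prop;
    CHECK(cudaGetDeviceProperties(&prop, 0));
    printf("Run time limit on kernels: %s\n",
           prop.kernelExecTimeoutEnabled ? "Yes" : "No");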