Finding Minimum Value in Array and its index using CUDA __shfl_down function

I am writing a function that uses CUDA to find the minimum value of a one-dimensional array, along with its index.

I started by modifying the reduction code used to find the sum of the values in a 1D array. The code works fine for the sum, but I cannot get it to find the minimum. I have attached the code below; if any CUDA experts can point out what I am doing wrong, please do.

The actual function is below. In the test example the array size is 1024, so the shuffle-reduction path is the one being exercised. The problem is that, compared with plain sequential CPU code, the per-block output values in g_oIdxs (which gives the indices) and in g_odata (which gives the minimum values) are wrong.

The values in g_odata are also all zeros (0) when I print them on the host.

Thanks in advance!

#include <stdio.h>
#include <stdlib.h>
#include <cuda.h>
#include <math.h>
#include <unistd.h>
#include <sys/time.h>

#if __DEVICE_EMULATION__
#define DEBUG_SYNC __syncthreads();
#else
#define DEBUG_SYNC
#endif

#ifndef MIN
#define MIN(x,y) (((x) < (y)) ? (x) : (y))
#endif

#ifndef MIN_IDX
#define MIN_IDX(x,y,idx_x,idx_y) (((x) < (y)) ? (idx_x) : (idx_y))
#endif

#if (__CUDA_ARCH__ < 200)
#define int_mult(x,y)   __mul24(x,y)
#else
#define int_mult(x,y)   x*y
#endif

#define inf 0x7f800000

bool isPow2(unsigned int x)
{
    return ((x&(x-1))==0);
}

unsigned int nextPow2(unsigned int x)
{
    --x;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    return ++x;
}

// Utility class used to avoid linker errors with extern
// unsized shared memory arrays with templated type
template<class T>
struct SharedMemory {
    __device__ inline operator T *() {
        extern __shared__ int __smem[];
        return (T *) __smem;
    }

    __device__ inline operator const T *() const {
        extern __shared__ int __smem[];
        return (T *) __smem;
    }
};

// specialize for double to avoid unaligned memory
// access compile errors
template<>
struct SharedMemory<double> {
    __device__ inline operator double *() {
        extern __shared__ double __smem_d[];
        return (double *) __smem_d;
    }

    __device__ inline operator const double *() const {
        extern __shared__ double __smem_d[];
        return (double *) __smem_d;
    }
};

/*
 This version adds multiple elements per thread sequentially.  This reduces the overall
 cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
 (Brent's Theorem optimization)

 Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
 In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
 If blockSize > 32, allocate blockSize*sizeof(T) bytes.
 */
template<class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) {
    T *sdata = SharedMemory<T>();

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;

    T mySum = 0;

    // we reduce multiple elements per thread.  The number is determined by the
    // number of active thread blocks (via gridDim).  More blocks will result
    // in a larger gridSize and therefore fewer elements per thread
    while (i < n) {
        mySum += g_idata[i];

        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
        if (nIsPow2 || i + blockSize < n)
            mySum += g_idata[i + blockSize];

        i += gridSize;
    }

    // each thread puts its local sum into shared memory
    sdata[tid] = mySum;
    __syncthreads();

    // do reduction in shared mem
    if ((blockSize >= 512) && (tid < 256)) {
        sdata[tid] = mySum = mySum + sdata[tid + 256];
    }

    __syncthreads();

    if ((blockSize >= 256) && (tid < 128)) {
        sdata[tid] = mySum = mySum + sdata[tid + 128];
    }

    __syncthreads();

    if ((blockSize >= 128) && (tid < 64)) {
        sdata[tid] = mySum = mySum + sdata[tid + 64];
    }

    __syncthreads();

#if (__CUDA_ARCH__ >= 300 )
    if (tid < 32) {
        // Fetch final intermediate sum from 2nd warp
        if (blockSize >= 64)
            mySum += sdata[tid + 32];
        // Reduce final warp using shuffle
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            mySum += __shfl_down(mySum, offset);
        }
    }
#else
    // fully unroll reduction within a single warp
    if ((blockSize >= 64) && (tid < 32))
    {
        sdata[tid] = mySum = mySum + sdata[tid + 32];
    }

    __syncthreads();

    if ((blockSize >= 32) && (tid < 16))
    {
        sdata[tid] = mySum = mySum + sdata[tid + 16];
    }

    __syncthreads();

    if ((blockSize >= 16) && (tid < 8))
    {
        sdata[tid] = mySum = mySum + sdata[tid + 8];
    }

    __syncthreads();

    if ((blockSize >= 8) && (tid < 4))
    {
        sdata[tid] = mySum = mySum + sdata[tid + 4];
    }

    __syncthreads();

    if ((blockSize >= 4) && (tid < 2))
    {
        sdata[tid] = mySum = mySum + sdata[tid + 2];
    }

    __syncthreads();

    if ((blockSize >= 2) && ( tid < 1))
    {
        sdata[tid] = mySum = mySum + sdata[tid + 1];
    }

    __syncthreads();
#endif

    // write result for this block to global mem
    if (tid == 0)
        g_odata[blockIdx.x] = mySum;
}


/*
 This version adds multiple elements per thread sequentially.  This reduces the overall
 cost of the algorithm while keeping the work complexity O(n) and the step complexity O(log n).
 (Brent's Theorem optimization)

 Note, this kernel needs a minimum of 64*sizeof(T) bytes of shared memory.
 In other words if blockSize <= 32, allocate 64*sizeof(T) bytes.
 If blockSize > 32, allocate blockSize*sizeof(T) bytes.
 */
template<class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduceMin6(T *g_idata, int *g_idxs, T *g_odata, int *g_oIdxs, unsigned int n) {
    T *sdata = SharedMemory<T>();

    int *sdataIdx = SharedMemory<int>();

    // perform first level of reduction,
    // reading from global memory, writing to shared memory
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
    unsigned int gridSize = blockSize * 2 * gridDim.x;


    T myMin = 99999;
    int myMinIdx = -1;
    // we reduce multiple elements per thread.  The number is determined by the
    // number of active thread blocks (via gridDim).  More blocks will result
    // in a larger gridSize and therefore fewer elements per thread
    while (i < n) {
        myMinIdx  = MIN_IDX(g_idata[i], myMin, g_idxs[i], myMinIdx);
        myMin = MIN(g_idata[i], myMin);

        // ensure we don't read out of bounds -- this is optimized away for powerOf2 sized arrays
        if (nIsPow2 || i + blockSize < n){
            //myMin += g_idata[i + blockSize];
            myMinIdx  = MIN_IDX(g_idata[i + blockSize], myMin, g_idxs[i + blockSize], myMinIdx);
            myMin = MIN(g_idata[i + blockSize], myMin);
        }

        i += gridSize;
    }

    // each thread puts its local min and its index into shared memory
    sdata[tid] = myMin;
    sdataIdx[tid] = myMinIdx;
    __syncthreads();

    // do reduction in shared mem
    if ((blockSize >= 512) && (tid < 256)) {
        //sdata[tid] = mySum = mySum + sdata[tid + 256];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 256], myMin, sdataIdx[tid + 256], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 256], myMin);
    }

    __syncthreads();

    if ((blockSize >= 256) && (tid < 128)) {
        //sdata[tid] = myMin = myMin + sdata[tid + 128];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 128], myMin, sdataIdx[tid + 128], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 128], myMin);
    }

    __syncthreads();

    if ((blockSize >= 128) && (tid < 64)) {
        //sdata[tid] = myMin = myMin + sdata[tid + 64];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 64], myMin, sdataIdx[tid + 64], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 64], myMin);
    }

    __syncthreads();

#if (__CUDA_ARCH__ >= 300 )
    if (tid < 32) {
        // Fetch final intermediate min from 2nd warp
        if (blockSize >= 64){
        //myMin += sdata[tid + 32];
            myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
            myMin = MIN(sdata[tid + 32], myMin);
        }
        // Reduce final warp using shuffle
        for (int offset = warpSize / 2; offset > 0; offset /= 2) {
            //myMin += __shfl_down(myMin, offset);
            int tempMyMinIdx = __shfl_down(myMinIdx, offset);
            float tempMyMin = __shfl_down(myMin, offset);

            myMinIdx = MIN_IDX(tempMyMin, myMin, tempMyMinIdx , myMinIdx);
            myMin = MIN(tempMyMin, myMin);
        }

    }
#else
    // fully unroll reduction within a single warp
    if ((blockSize >= 64) && (tid < 32))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 32];
        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 32], myMin, sdataIdx[tid + 32], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 32], myMin);
    }

    __syncthreads();

    if ((blockSize >= 32) && (tid < 16))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 16];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 16], myMin, sdataIdx[tid + 16], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 16], myMin);
    }

    __syncthreads();

    if ((blockSize >= 16) && (tid < 8))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 8];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 8], myMin, sdataIdx[tid + 8], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 8], myMin);
    }

    __syncthreads();

    if ((blockSize >= 8) && (tid < 4))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 4];

        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 4], myMin, sdataIdx[tid + 4], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 4], myMin);
    }

    __syncthreads();

    if ((blockSize >= 4) && (tid < 2))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 2];
        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 2], myMin, sdataIdx[tid + 2], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 2], myMin);
    }

    __syncthreads();

    if ((blockSize >= 2) && ( tid < 1))
    {
        //sdata[tid] = myMin = myMin + sdata[tid + 1];
        sdataIdx[tid] = myMinIdx = MIN_IDX(sdata[tid + 1], myMin, sdataIdx[tid + 1], myMinIdx);
        sdata[tid] = myMin = MIN(sdata[tid + 1], myMin);
    }

    __syncthreads();
#endif

    __syncthreads();
    // write result for this block to global mem
    if (tid == 0){
        g_odata[blockIdx.x] = myMin;
        g_oIdxs[blockIdx.x] = myMinIdx;
    }
}

////////////////////////////////////////////////////////////////////////////////
// Compute the number of threads and blocks to use for the given reduction kernel
// For the kernels >= 3, we set threads / block to the minimum of maxThreads and
// n/2. For kernels < 3, we set to the minimum of maxThreads and n.  For kernel
// 6, we observe the maximum specified number of blocks, because each thread in
// that kernel can process a variable number of elements.
////////////////////////////////////////////////////////////////////////////////
void getNumBlocksAndThreads(int whichKernel, int n, int maxBlocks,
        int maxThreads, int &blocks, int &threads) {

    //get device capability, to avoid block/grid size exceed the upper bound
    cudaDeviceProp prop;
    int device;
    cudaGetDevice(&device);
    cudaGetDeviceProperties(&prop, device);

    if (whichKernel < 3) {
        threads = (n < maxThreads) ? nextPow2(n) : maxThreads;
        blocks = (n + threads - 1) / threads;
    } else {
        threads = (n < maxThreads * 2) ? nextPow2((n + 1) / 2) : maxThreads;
        blocks = (n + (threads * 2 - 1)) / (threads * 2);
    }

    if ((float) threads * blocks
            > (float) prop.maxGridSize[0] * prop.maxThreadsPerBlock) {
        printf("n is too large, please choose a smaller number!\n");
    }

    if (blocks > prop.maxGridSize[0]) {
        printf(
                "Grid size <%d> exceeds the device capability <%d>, set block size as %d (original %d)\n",
                blocks, prop.maxGridSize[0], threads * 2, threads);

        blocks /= 2;
        threads *= 2;
    }

    if (whichKernel == 6) {
        blocks = MIN(maxBlocks, blocks);
    }
}

////////////////////////////////////////////////////////////////////////////////
// Wrapper function for kernel launch
////////////////////////////////////////////////////////////////////////////////
template<class T>
void reduce(int size, int threads, int blocks, int whichKernel, T *d_idata,
        T *d_odata) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    int smemSize =
            (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);

    if (isPow2(size)) {
        switch (threads) {
        case 512:
            reduce6<T, 512, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 256:
            reduce6<T, 256, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 128:
            reduce6<T, 128, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 64:
            reduce6<T, 64, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 32:
            reduce6<T, 32, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 16:
            reduce6<T, 16, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 8:
            reduce6<T, 8, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 4:
            reduce6<T, 4, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 2:
            reduce6<T, 2, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 1:
            reduce6<T, 1, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;
        }
    } else {
        switch (threads) {
        case 512:
            reduce6<T, 512, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 256:
            reduce6<T, 256, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 128:
            reduce6<T, 128, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 64:
            reduce6<T, 64, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 32:
            reduce6<T, 32, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 16:
            reduce6<T, 16, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 8:
            reduce6<T, 8, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 4:
            reduce6<T, 4, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 2:
            reduce6<T, 2, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;

        case 1:
            reduce6<T, 1, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata,
                    d_odata, size);
            break;
        }
    }

}


////////////////////////////////////////////////////////////////////////////////
// Wrapper function for kernel launch
////////////////////////////////////////////////////////////////////////////////
template<class T>
void reduceMin(int size, int threads, int blocks, int whichKernel, T *d_idata,
        T *d_odata, int *idxs, int *oIdxs) {
    dim3 dimBlock(threads, 1, 1);
    dim3 dimGrid(blocks, 1, 1);

    // when there is only one warp per block, we need to allocate two warps
    // worth of shared memory so that we don't index shared memory out of bounds
    int smemSize =
            (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);

    if (isPow2(size)) {
        switch (threads) {
        case 512:
            reduceMin6<T, 512, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 256:
            reduceMin6<T, 256, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 128:
            reduceMin6<T, 128, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 64:
            reduceMin6<T, 64, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 32:
            reduceMin6<T, 32, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 16:
            reduceMin6<T, 16, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 8:
            reduceMin6<T, 8, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 4:
            reduceMin6<T, 4, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 2:
            reduceMin6<T, 2, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 1:
            reduceMin6<T, 1, true> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;
        }
    } else {
        switch (threads) {
        case 512:
            reduceMin6<T, 512, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 256:
            reduceMin6<T, 256, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 128:
            reduceMin6<T, 128, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 64:
            reduceMin6<T, 64, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 32:
            reduceMin6<T, 32, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 16:
            reduceMin6<T, 16, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 8:
            reduceMin6<T, 8, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 4:
            reduceMin6<T, 4, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 2:
            reduceMin6<T, 2, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;

        case 1:
            reduceMin6<T, 1, false> <<<dimGrid, dimBlock, smemSize>>>(d_idata, idxs,
                    d_odata, oIdxs, size);
            break;
        }
    }

}

////////////////////////////////////////////////////////////////////////////////
//! Compute min reduction on CPU
//! Sequential reference used to check the GPU result.
//!
//! @param data       pointer to input data
//! @param size       number of input data elements
//! @param min        output: minimum value found
//! @param idx        output: index of the minimum value
////////////////////////////////////////////////////////////////////////////////
template<class T>
void reduceMINCPU(T *data, int size, T *min, int *idx)
{
    *min = data[0];
    int min_idx = 0;

    for (int i = 1; i < size; i++)
    {
        T y = data[i];
        T t = MIN(*min, y);
        min_idx = MIN_IDX(*min, y, min_idx, i);
        (*min) = t;
    }

    *idx = min_idx;

    return;
}

////////////////////////////////////////////////////////////////////////////////
//! Compute sum reduction on CPU
//! We use Kahan summation for an accurate sum of large arrays.
//! http://en.wikipedia.org/wiki/Kahan_summation_algorithm
//!
//! @param data       pointer to input data
//! @param size       number of input data elements
////////////////////////////////////////////////////////////////////////////////
template<class T>
T reduceCPU(T *data, int size)
{
    T sum = data[0];
    T c = (T)0.0;

    for (int i = 1; i < size; i++)
    {
        T y = data[i] - c;
        T t = sum + y;
        c = (t - sum) - y;
        sum = t;
    }

    return sum;
}

// Instantiate the reduction function for 3 types
template void
reduce<int>(int size, int threads, int blocks, int whichKernel, int *d_idata,
        int *d_odata);

template void
reduce<float>(int size, int threads, int blocks, int whichKernel,
        float *d_idata, float *d_odata);

template void
reduce<double>(int size, int threads, int blocks, int whichKernel,
        double *d_idata, double *d_odata);

// Instantiate the reduction function for 3 types
template void
reduceMin<int>(int size, int threads, int blocks, int whichKernel, int *d_idata,
        int *d_odata, int *idxs, int *oIdxs);

template void
reduceMin<float>(int size, int threads, int blocks, int whichKernel, float *d_idata,
        float *d_odata, int *idxs, int *oIdxs);

template void
reduceMin<double>(int size, int threads, int blocks, int whichKernel, double *d_idata,
        double *d_odata, int *idxs, int *oIdxs);

unsigned long long int my_min_max_test(int num_els) {

    // timers

    unsigned long long int start = 0;   // timing not implemented in this test
    unsigned long long int delta = 0;

    int maxThreads = 256;  // number of threads per block
    int whichKernel = 6;
    int maxBlocks = 64;

    int testIterations = 100;



    float* d_in = NULL;
    float* d_out = NULL;
    int *d_idxs = NULL;
    int *d_oIdxs = NULL;

    printf("%d elements\n", num_els);
    printf("%d threads (max)\n", maxThreads);

    int numBlocks = 0;
    int numThreads = 0;
    getNumBlocksAndThreads(whichKernel, num_els, maxBlocks, maxThreads, numBlocks,
            numThreads);



    //  in[1024] = 34.0f;
    //  in[333] = 55.0f;
    //  in[23523] = -42.0f;

//  cudaMalloc((void**) &d_in, size);
//  cudaMalloc((void**) &d_out, size);
//  cudaMalloc((void**) &d_idxs, num_els * sizeof(int));

    cudaMalloc((void **) &d_in, num_els * sizeof(float));
    cudaMalloc((void **) &d_idxs, num_els * sizeof(int));
    cudaMalloc((void **) &d_out, numBlocks * sizeof(float));
    cudaMalloc((void **) &d_oIdxs, numBlocks * sizeof(int));

    float* in = (float*) malloc(num_els * sizeof(float));
    int *idxs = (int*) malloc(num_els * sizeof(int));
    float* out = (float*) malloc(numBlocks * sizeof(float));
    int* oIdxs = (int*) malloc(numBlocks * sizeof(int));

    for (int i = 0; i < num_els; i++) {
        in[i] = (double) rand() / (double) RAND_MAX;
        idxs[i] = i;
    }

    for (int i = 0; i < num_els; i++) {

        printf("\n [%d] = %.4f", idxs[i], in[i]);
    }

    // copy data directly to device memory
    cudaMemcpy(d_in, in, num_els * sizeof(float), cudaMemcpyHostToDevice);
    cudaMemcpy(d_idxs, idxs, num_els * sizeof(int),cudaMemcpyHostToDevice);
    cudaMemcpy(d_out, out, numBlocks * sizeof(float),cudaMemcpyHostToDevice);
    cudaMemcpy(d_oIdxs, oIdxs, numBlocks * sizeof(int),cudaMemcpyHostToDevice);

    // warm-up
//  reduce<float>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out);
//
//  cudaMemcpy(out, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToHost);
//
//  for(int i=0; i< numBlocks; i++)
//  printf("\nFinal Result[BLK:%d]: %f", i, out[i]);

//  printf("\n Reduce CPU : %f", reduceCPU<float>(in, num_els));

    reduceMin<float>(num_els, numThreads, numBlocks, whichKernel, d_in, d_out, d_idxs, d_oIdxs);

    cudaMemcpy(out, d_out, numBlocks * sizeof(float), cudaMemcpyDeviceToHost);
    cudaMemcpy(oIdxs, d_oIdxs, numBlocks * sizeof(int), cudaMemcpyDeviceToHost);

    for(int i=0; i< numBlocks; i++)
        printf("\n Reduce MIN GPU idx: %d  value: %f", oIdxs[i], out[i]);


    int min_idx;
    float min;
    reduceMINCPU<float>(in, num_els, &min, &min_idx);

    printf("\n\n Reduce MIN CPU idx: %d  value: %f", min_idx, min);

    cudaFree(d_in);
    cudaFree(d_out);
    cudaFree(d_idxs);
    cudaFree(d_oIdxs);

    free(in);
    free(out);
    free(idxs);
    free(oIdxs);

    //system("pause");

    return delta;

}

int main(int argc, char* argv[]) {

    printf(" GTS250 @ 70.6 GB/s - Finding min and max");
    printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n");

//#pragma unroll
//for(int i = 1024*1024; i <= 32*1024*1024; i=i*2)
//{
//  my_min_max_test(i);
//}

    printf("\n Non-base 2 tests! \n");
    printf("\n N \t\t [GB/s] \t [perc] \t [usec] \t test \n");

    my_min_max_test(1024);



// just some large numbers....
//my_min_max_test(14*1024*1024+38);
//my_min_max_test(14*1024*1024+55);
//my_min_max_test(18*1024*1024+1232);
//my_min_max_test(7*1024*1024+94854);

//for(int i = 0; i < 4; i++)
//{
//
//  float ratio = float(rand())/float(RAND_MAX);
//  ratio = ratio >= 0 ? ratio : -ratio;
//  int big_num = ratio*18*1e6;
//
//  my_min_max_test(big_num);
//}

    return 0;
}

This is not a valid way to use dynamically allocated shared memory:

T *sdata = SharedMemory<T>();

int *sdataIdx = SharedMemory<int>();

Those two pointers (sdata and sdataIdx) end up pointing to the same location. (The documentation discusses how to handle this correctly.)
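
For reference, the pattern the documentation describes is one extern __shared__ declaration per kernel, with the separate logical arrays carved out of that single buffer by hand. A minimal sketch (the kernel name and layout here are my illustration, not code from the question):

__global__ void minWithIndexDemo(const float *in, unsigned int n)
{
    extern __shared__ float smem[];              // single dynamic allocation
    float *mins = smem;                          // first blockDim.x floats
    int *idxs = (int *) &mins[blockDim.x];       // blockDim.x ints placed after them

    unsigned int tid = threadIdx.x;
    mins[tid] = (tid < n) ? in[tid] : 99999.0f;  // same sentinel the question uses
    idxs[tid] = (tid < n) ? (int) tid : -1;
    __syncthreads();
    // ... reduce over mins[]/idxs[] exactly as reduceMin6 does ...
}

The launch then has to request enough bytes for both arrays, e.g. blockDim.x * (sizeof(float) + sizeof(int)).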

Furthermore, even though you now want to use twice as much shared memory (to store the minimum plus the index), you have not increased the dynamic allocation size compared with the previous sum-only reduction code:

int smemSize =
        (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);

In your example, threads is 256, so you have allocated enough (threads*sizeof(T)) for the minimum-value storage, but nothing for the index storage.
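
To make the shortfall concrete (my arithmetic, not part of the original answer), with T = float and threads = 256:

smemSize actually allocated: 256 * sizeof(float)                     = 1024 bytes
needed for the two arrays:   256 * sizeof(float) + 256 * sizeof(int) = 2048 bytes

So even with the pointer arithmetic fixed, sdataIdx would still run past the end of the allocation, which is why both of the changes below are needed.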

We can "fix" these issues by making the following changes:

T *sdata = SharedMemory<T>();

int *sdataIdx = (int *)(sdata + blockSize);

and:

int smemSize =
        (threads <= 32) ? 2 * threads * sizeof(T) : threads * sizeof(T);
smemSize += threads*sizeof(int);

With these changes, your code produces the expected output:

$ cuda-memcheck ./t1182
========= CUDA-MEMCHECK
 GTS250 @ 70.6 GB/s - Finding min and max
 N    [GB/s]   [perc]   [usec]   test

 Non-base 2 tests!

 N    [GB/s]   [perc]   [usec]   test
1024 elements
256 threads (max)

 Reduce MIN GPU idx: 484  value: 0.001125
 Reduce MIN GPU idx: 972  value: 0.002828

 Reduce MIN CPU idx: 484  value: 0.001125
========= ERROR SUMMARY: 0 errors
$

(I commented out the loop that prints out all 1024 initial values.)
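
One further note beyond the original answer: reduceMin6 produces one (value, index) pair per block, which is why the run above prints two GPU lines. A final pass is still needed to pick the overall minimum; a minimal host-side sketch, reusing the out, oIdxs, and numBlocks names from my_min_max_test:

float globalMin = out[0];
int globalMinIdx = oIdxs[0];
for (int i = 1; i < numBlocks; i++) {
    if (out[i] < globalMin) {   // strict '<' keeps the first occurrence
        globalMin = out[i];
        globalMinIdx = oIdxs[i];
    }
}
printf("\n Overall MIN GPU idx: %d  value: %f", globalMinIdx, globalMin);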