将非方形图像存储在一个卷中

Storing non-square images in a volume

我一直在尝试将非正方形图像存储到一个卷(volume)中。当我把大小为 w512 x h512 的方形图像存储 512 次时,代码可以正常工作;但对于大小为 w512 x h1024 的非方形图像,代码就会出错。程序在 gpuErrchk( cudaDeviceSynchronize() ); 这一行报出 "unspecified launch failure" 错误,我不确定原因是什么。我试图让代码以合并(coalesced)的方式访问数据。我的问题有其他解决方案吗?

这是我的内核:

// Copies one image slice into frame `frameIdx` of a linear volume buffer.
// Each thread copies exactly one float: slice[tid] -> buffer[tid + frameIdx*width*height],
// where tid = blockIdx.x * width + threadIdx.x (blockIdx.x selects the row start,
// threadIdx.x the element within it).
//
// NOTE(review): the parameters are declared in the order (height, width), while the
// launch in testStorage passes (width, height) in those two positions, so inside this
// kernel `width` holds the caller's height and vice versa.
// NOTE(review): there is no bounds check; every launched thread reads and writes.
__global__ void 
copySlice2Volume2(float *buffer, float *slice, int height, int width, int frameIdx) 
{

    // Linear index of this thread's element within a single slice.
    int tid = (blockIdx.x * width) + threadIdx.x;
    // frameIdx*width*height is evaluated in int.
    // NOTE(review): this can overflow for large volumes - confirm expected sizes.
    buffer[tid + (frameIdx*width*height)] = slice[tid]; 

    // No __shared__ memory is declared in this kernel, so this barrier has no
    // shared data to order.
    __syncthreads();

}

下面是调用上面内核的函数:

// Host-side smoke test for copySlice2Volume2.
//
// Loads one grayscale image, converts it to float in [0, 1], uploads it to the
// device and copies it nFrames times into a device volume, one kernel launch
// per frame. Launch layout: one block per image row, one thread per column,
// so the image width must not exceed the device's maximum threads per block;
// a violation is reported by the cudaGetLastError() check after each launch.
void testStorage() {

    const int nFrames = 512;

    // 8-bit unsigned char image and its float conversion
    Mat frame, floatFrame;

    frame = imread("C:/Matlab code/im.png", CV_LOAD_IMAGE_GRAYSCALE);
    if (frame.empty())
    {
        // imread yields an empty Mat when the file is missing or unreadable.
        cout << "testStorage: could not read input image" << endl;
        return;
    }

    // convert uchar image to float image
    frame.convertTo(floatFrame, CV_32F, 1.0/255.0f);

    // The upload below treats the pixel data as one dense block of floats.
    if (!floatFrame.isContinuous())
        floatFrame = floatFrame.clone();

    // Use the element counts of the float image. Mat::step is a row stride in
    // BYTES; for the 8-bit source it only equals the pixel width when the rows
    // carry no padding, so it must not be used as the image width.
    const int width  = floatFrame.cols;
    const int height = floatFrame.rows;

    cout << "width: " << width << " height: " << height << endl;

    // sizeof() makes these products size_t, so they do not overflow int.
    const size_t sliceBytes  = sizeof(float) * width * height;
    const size_t volumeBytes = sliceBytes * nFrames;

    float *gpuBuffer = NULL;
    float *testImage = NULL;

    gpuErrchk( cudaMalloc( (void**) &gpuBuffer, volumeBytes ) );    // storage init for buffer
    gpuErrchk( cudaMemset( gpuBuffer, 0, volumeBytes ) );           // set mem to 0

    // No memset needed for the image: the copy overwrites the whole allocation.
    gpuErrchk( cudaMalloc( (void**) &testImage, sliceBytes ) );     // storage init for image
    gpuErrchk( cudaMemcpy( testImage, floatFrame.ptr<float>(), sliceBytes, cudaMemcpyHostToDevice ) );

    // one thread per column
    dim3 Threads(width);
    // one block per row
    dim3 Blocks(height);

    for (int i = 0; i < nFrames; i++)
    {
        // Arguments are passed as (width, height); the kernel signature must
        // declare them in the same order.
        copySlice2Volume2<<< Blocks, Threads >>> (gpuBuffer, testImage, width, height, i);

        // Launch-configuration errors are only visible through cudaGetLastError().
        gpuErrchk( cudaGetLastError() );
    }

    // Faults raised while the kernels execute surface at this synchronization.
    gpuErrchk( cudaDeviceSynchronize() );

    printf("Cuda status2: %s\n", cudaGetErrorString( cudaGetLastError() ) );

    gpuErrchk( cudaFree(gpuBuffer) );
    gpuErrchk( cudaFree(testImage) );

}

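内核签名中参数的声明顺序是 (height, width),而调用时传入的顺序是 (width, height),两者被互换了,这就是错误的原因。对于方形图像,宽高相等,互换没有影响;但对于 w512 x h1024 的图像,内核会把 1024 当作行宽来计算 tid,导致索引超出切片和缓冲区的范围,从而触发 unspecified launch failure。将内核签名改为 (width, height) 之后,一切正常。这是更正后的内核: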

// Copies one width x height slice into frame `frameIdx` of a densely packed
// volume: buffer[frameIdx*width*height + row*width + col] = slice[row*width + col].
//
// Launch layout: 1D grid with one block per image row (gridDim.x >= height)
// and one thread per column (blockDim.x >= width). Threads that fall outside
// the image return without touching memory, so an oversized launch is safe.
// No shared memory is used.
//
// buffer   - volume of at least (frameIdx + 1) * width * height floats
// slice    - source image of width * height floats, row-major
// width    - number of columns in the slice
// height   - number of rows in the slice
// frameIdx - zero-based index of the destination frame
__global__ void 
copySlice2Volume2(float *buffer, float *slice, int width, int height, int frameIdx) 
{
    const int row = blockIdx.x;
    const int col = threadIdx.x;

    // Guard against launches larger than the image (also rejects non-positive sizes).
    if (col >= width || row >= height)
        return;

    // 64-bit arithmetic: frameIdx * width * height can exceed the int range
    // for large volumes.
    const size_t tid         = (size_t)row * width + col;
    const size_t frameOffset = (size_t)frameIdx * width * height;

    // Consecutive threads access consecutive addresses in both arrays.
    buffer[frameOffset + tid] = slice[tid];
}