Cuda Memory access error : CudaIllegalAddress , Image Processing(Stereo vision)

Cuda Memory access error : CudaIllegalAddress , Image Processing(Stereo vision)

I'm using cuda to deal with image proccessing. but my result is always get 'cudaErrorIllegalAddress : an illegal memory access was encountered'

下面是我做的。

首先,将转换后的图像(rgb 到灰色)加载到设备,我使用 'cudaMallocPitch' 和 'cudaMemcpy2D'

unsigned char *dev_srcleft;
size_t dev_srcleftPitch
cudaMallocPitch((void**)&dev_srcleft, &dev_srcleftPitch, COLS * sizeof(int), ROWS));
cudaMemcpy2D(dev_srcleft, dev_srcleftPitch, host_srcConvertL.data, host_srcConvertL.step,
    COLS, ROWS, cudaMemcpyHostToDevice);

并且,为存储结果分配二维数组。结果值被描述为 27 位,所以我正在尝试使用 'int',即 4bytes=32bits,不仅为了足够的大小,还需要原子操作(atomicOr,atomicXor)来提高性能。 而且我的设备不支持 64 位原子操作。

int *dev_leftTrans;
cudaMallocPitch((void**)&dev_leftTrans, &dev_leftTransPitch, COLS * sizeof(int), ROWS);
cudaMemset2D(dev_leftTrans, dev_leftTransPitch, 0, COLS, ROWS);

内存分配和 memcpy2D 工作得很好,我通过

检查
Mat temp_output(ROWS, COLS, 0);
cudaMemcpy2D(temp_output.data, temp_output.step, dev_srcleft, dev_srcleftPitch, COLS, ROWS, cudaMemcpyDeviceToHost);
imshow("temp", temp_output);

然后,做内核代码。

__global__ void TestKernel(unsigned char *src, size_t src_pitch, 
                                     int *dst, size_t dst_pitch,
                            unsigned int COLS, unsigned int ROWS)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

    unsigned char src_val = src[x + y * src_pitch];
    dst[x + y * dst_pitch] = src_val;
}

dim3 dimblock(3, 3);
dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS /  dimblock.y));
TestKernel << <dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char) >> >
    (dev_srcleft, dev_srcleftPitch, dev_leftTrans, dev_leftTransPitch, COLS, ROWS);

参数COLS和ROWS为图片大小。 我认为错误发生在这里:TestKerenl.

src_val,从全局内存读取效果很好,但是当我尝试访问 dst 时,它因 cudaErrorIllegalAddress

而爆炸

不知道怎么回事,折腾了4天。请帮助我

下面是我的完整代码

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>
#include <device_functions.h>
#include <cuda_device_runtime_api.h>
#include <device_launch_parameters.h>
#include <math.h>
#include <iostream>
#include <opencv2\opencv.hpp>
#include<string>



#define HANDLE_ERROR(err)(HandleError(err, __FILE__, __LINE__))
static void HandleError(cudaError_t err, const char*file, int line)
{
    if (err != cudaSuccess)
    {
        printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
        exit(EXIT_FAILURE);
    }
}
using namespace std;
using namespace cv;

string imagePath = "Ted";
string imagePathL = imagePath + "imL.png";
string imagePathR = imagePath + "imR.png";


__global__ void TestKernel(unsigned char*src, size_t src_pitch,
                       int *dst, size_t dst_pitch,
                       unsigned int COLS, unsigned int ROWS)
{
    const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;
    if ((COLS< x) && (ROWS < y)) return;

    unsigned char src_val = src[x + y * src_pitch];
    dst[x + y * dst_pitch] = src_val;
}

int main(void)
{
    //Print_DeviceProperty();
    //Left Image Load
    Mat host_srcImgL = imread(imagePathL, CV_LOAD_IMAGE_UNCHANGED);
    if (host_srcImgL.empty()){ cout << "Left Image Load Fail!" << endl;     return; }
    Mat host_srcConvertL;
    cvtColor(host_srcImgL, host_srcConvertL, CV_BGR2GRAY);

    //Right Image Load
    Mat host_srcImgR = imread(imagePathR, CV_LOAD_IMAGE_UNCHANGED);
    if (host_srcImgL.empty()){ cout << "Right Image Load Fail!" << endl; return; }
    Mat host_srcConvertR;
    cvtColor(host_srcImgR, host_srcConvertR, CV_BGR2GRAY);

    //Create parameters
    unsigned int COLS = host_srcConvertL.cols;
    unsigned int ROWS = host_srcConvertR.rows;
    unsigned int SIZE = COLS * ROWS;
    imshow("Left source image", host_srcConvertL);
    imshow("Right source image", host_srcConvertR);

    unsigned char *dev_srcleft, *dev_srcright, *dev_disp;
    int *dev_leftTrans, *dev_rightTrans;
    size_t dev_srcleftPitch, dev_srcrightPitch, dev_dispPitch, dev_leftTransPitch, dev_rightTransPitch;
    cudaMallocPitch((void**)&dev_srcleft, &dev_srcleftPitch, COLS, ROWS);
    cudaMallocPitch((void**)&dev_srcright, &dev_srcrightPitch, COLS, ROWS);
    cudaMallocPitch((void**)&dev_disp, &dev_dispPitch, COLS, ROWS);
    cudaMallocPitch((void**)&dev_leftTrans, &dev_leftTransPitch, COLS * sizeof(int), ROWS);
    cudaMallocPitch((void**)&dev_rightTrans, &dev_rightTransPitch, COLS * sizeof(int), ROWS);

    cudaMemcpy2D(dev_srcleft, dev_srcleftPitch, host_srcConvertL.data, host_srcConvertL.step,
    COLS, ROWS, cudaMemcpyHostToDevice);
    cudaMemcpy2D(dev_srcright, dev_srcrightPitch, host_srcConvertR.data, host_srcConvertR.step,
    COLS, ROWS, cudaMemcpyHostToDevice);
    cudaMemset(dev_disp, 255, dev_dispPitch * ROWS);

    dim3 dimblock(3, 3);
    dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS / dimblock.y));




    cudaEvent_t start, stop;
    float elapsedtime;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);
    cudaEventRecord(start, 0);

    TestKernel << <dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char) >> >
    (dev_srcleft, dev_srcleftPitch, dev_leftTrans, dev_leftTransPitch, COLS, ROWS);
    /*TestKernel << <dimGrid, dimblock, dimblock.x * dimblock.y * sizeof(char) >> >
    (dev_srcright, dev_srcrightPitch, dev_rightTrans, dev_rightTransPitch, COLS, ROWS);*/
    cudaThreadSynchronize();

    cudaError_t res = cudaGetLastError();
    if (res != cudaSuccess)
    printf("%s : %s\n", cudaGetErrorName(res), cudaGetErrorString(res));

    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&elapsedtime, start, stop);
    cudaEventDestroy(start);
    cudaEventDestroy(stop);
    cout << elapsedtime << "msec" << endl;


    Mat temp_output(ROWS, COLS, 0);
    cudaMemcpy2D((int*)temp_output.data, temp_output.step, dev_leftTrans, dev_leftTransPitch, COLS, ROWS, cudaMemcpyDeviceToHost);
    imshow("temp", temp_output);
    waitKey(0);
    return 0;
}

这是我的环境vs2013,cuda v6.5 设备'属性 低于

Major revision number:         3
Minor revision number:         0
 Name:                          GeForce GTX 760 (192-bit)
 Total global memory:           1610612736
 Total shared memory per block: 49152
 Total registers per block:     65536
 Warp size:                     32
 Maximum memory pitch:          2147483647
 Maximum threads per block:     1024
 Maximum dimension 0 of block:  1024
 Maximum dimension 1 of block:  1024
 Maximum dimension 2 of block:  64
 Maximum dimension 0 of grid:   2147483647
 Maximum dimension 1 of grid:   65535
 Maximum dimension 2 of grid:   65535
 Clock rate:                    888500
 Total constant memory:         65536
 Texture alignment:             512
 Concurrent copy and execution: Yes
 Number of multiprocessors:     6
 Kernel execution timeout:      Yes

一个问题是您的内核没有进行任何线程检查。

当您像这样定义块网格时:

dim3 dimGrid(ceil((float)COLS / dimblock.x), ceil((float)ROWS /  dimblock.y));

您通常会启动 额外的 块。原因是如果 COLSROW 不能被块尺寸(在本例中为 3)整除,那么在每种情况下您将获得额外的块来覆盖剩余部分。

这些额外的块将有一些线程在做有用的工作,还有一些线程会越界访问。为了防止这种情况,通常在内核中进行线程检查以防止越界访问:

const unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
const unsigned int y = blockIdx.y * blockDim.y + threadIdx.y;

if ((x < COLS) && (y < ROWS)) {  // add this
  unsigned char src_val = src[x + y * src_pitch];
  dst[x + y * dst_pitch] = src_val;
  }   // add this

这意味着只有具有有效(边界内)xy 的线程才会实际进行任何访问。

顺便说一句,出于性能原因,(3,3) 可能不是块维度的特别好的选择。创建乘积为 32 的倍数的块维度通常是个好主意,因此 (32,4) 或 (16,16) 可能是更好选择的示例。

代码中的另一个问题是 pitchdst 数组的使用。 间距始终以字节为单位,因此首先需要将 dst 指针转换为 char*,计算行偏移量,然后将其转换回 int*:

int* dst_row = (int*)(((char*)dst) + y * dst_pitch);
dst_row[x] = src_val;