CUDA C/C++: Async kernel error: an illegal memory access was encountered
CUDA C/C++: Async kernel error: an illegal memory access was encountered
我是 GPU 并行编程的初学者。我正在尝试在 512x512
RGB 图像大小和 9x9 filter
.
之间实现卷积运算
我收到一个错误:遇到非法内存访问。
这是我的内核函数:
__global__ void blurImgKernel(uchar3 * inPixels, int width, int height,
float * filter, int filterWidth,
uchar3 * outPixels)
{
// TODO
int c = threadIdx.x + blockIdx.x * blockDim.x;
int r = threadIdx.y + blockIdx.y * blockDim.y;
int padding = filterWidth / 2;
if (r < height && c < width)
{
int idx = r * width + c;
float red = 0;
float green = 0;
float blue = 0;
// Do some calculation here....
// ............................
outPixels[idx].x = (uint8_t)red;
outPixels[idx].y = (uint8_t)green;
outPixels[idx].z = (uint8_t)blue;
}
}
我如何调用这个函数:
dim3 blockSize(32, 32);
uchar3 *d_inPixels, *d_outPixels;
CHECK(cudaMalloc(&d_inPixels, width*height*sizeof(uchar3)));
CHECK(cudaMalloc(&d_outPixels, width*height*sizeof(uchar3)));
// Copy data to device memories
CHECK(cudaMemcpy(d_inPixels, inPixels,
width*height*sizeof(uchar3), cudaMemcpyHostToDevice));
// Set grid size and call kernel (remember to check kernel error)
dim3 gridSize((height - 1) / blockSize.x + 1, (width - 1) / blockSize.y + 1);
blurImgKernel<<<gridSize, blockSize>>>(d_inPixels, width, height, filter, filterWidth, d_outPixels);
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
// Copy result from device memories
CHECK(cudaMemcpy(outPixels, d_outPixels,
width*height*sizeof(uchar3), cudaMemcpyDeviceToHost));
当我使用 cuda-memcheck
获取更多错误详细信息时,我遇到了很多这样的错误:
========= Invalid __global__ read of size 4
========= at 0x00000490 in blurImgKernel(uchar3*, int, int, float*, int, uchar3*)
========= by thread (0,10,0) in block (0,0,0)
========= Address 0x562bcf426000 is out of bounds
========= Device Frame:blurImgKernel(uchar3*, int, int, float*, int, uchar3*) (blurImgKernel(uchar3*, int, int, float*, int, uchar3*) : 0x490)
========= Saved host backtrace up to driver entry point at kernel launch time
我看到我的代码看起来不错,但我认为划分 blocksize 和 gridsize 步长有问题。谁能帮我弄清楚?
实际上,我忘记在设备上分配filter
。我分配后一切正常
我是 GPU 并行编程的初学者。我正在尝试在 512x512
RGB 图像大小和 9x9 filter
.
我收到一个错误:遇到非法内存访问。
这是我的内核函数:
__global__ void blurImgKernel(uchar3 * inPixels, int width, int height,
float * filter, int filterWidth,
uchar3 * outPixels)
{
// TODO
int c = threadIdx.x + blockIdx.x * blockDim.x;
int r = threadIdx.y + blockIdx.y * blockDim.y;
int padding = filterWidth / 2;
if (r < height && c < width)
{
int idx = r * width + c;
float red = 0;
float green = 0;
float blue = 0;
// Do some calculation here....
// ............................
outPixels[idx].x = (uint8_t)red;
outPixels[idx].y = (uint8_t)green;
outPixels[idx].z = (uint8_t)blue;
}
}
我如何调用这个函数:
dim3 blockSize(32, 32);
uchar3 *d_inPixels, *d_outPixels;
CHECK(cudaMalloc(&d_inPixels, width*height*sizeof(uchar3)));
CHECK(cudaMalloc(&d_outPixels, width*height*sizeof(uchar3)));
// Copy data to device memories
CHECK(cudaMemcpy(d_inPixels, inPixels,
width*height*sizeof(uchar3), cudaMemcpyHostToDevice));
// Set grid size and call kernel (remember to check kernel error)
dim3 gridSize((height - 1) / blockSize.x + 1, (width - 1) / blockSize.y + 1);
blurImgKernel<<<gridSize, blockSize>>>(d_inPixels, width, height, filter, filterWidth, d_outPixels);
cudaError_t errSync = cudaGetLastError();
cudaError_t errAsync = cudaDeviceSynchronize();
if (errSync != cudaSuccess)
printf("Sync kernel error: %s\n", cudaGetErrorString(errSync));
if (errAsync != cudaSuccess)
printf("Async kernel error: %s\n", cudaGetErrorString(errAsync));
// Copy result from device memories
CHECK(cudaMemcpy(outPixels, d_outPixels,
width*height*sizeof(uchar3), cudaMemcpyDeviceToHost));
当我使用 cuda-memcheck
获取更多错误详细信息时,我遇到了很多这样的错误:
========= Invalid __global__ read of size 4 ========= at 0x00000490 in blurImgKernel(uchar3*, int, int, float*, int, uchar3*) ========= by thread (0,10,0) in block (0,0,0) ========= Address 0x562bcf426000 is out of bounds ========= Device Frame:blurImgKernel(uchar3*, int, int, float*, int, uchar3*) (blurImgKernel(uchar3*, int, int, float*, int, uchar3*) : 0x490) ========= Saved host backtrace up to driver entry point at kernel launch time
我看到我的代码看起来不错,但我认为划分 blocksize 和 gridsize 步长有问题。谁能帮我弄清楚?
实际上,我忘记在设备上分配filter
。我分配后一切正常