这个简单的cuda程序有什么问题?

What's wrong with this simple cuda program?

我只是想使用 CUDA 来清空图像。但是 "before" 和 "after" 我得到了相同的原始图像。想不通问题。

sumKernel.cu:

#include "sumKernel.h"

__global__ void _sumKernel(char *image, int width, int height, char *kernel, int kerwidth, int kerheight) {
    int idx = blockIdx.x * blockDim.x + threadIdx.x;
    image[idx] = 0;
}

void sumKernel(char *image, int width, int height, char *kernel, int kerwidth, int kerheight) {
    dim3 blocks(1);
    dim3 threads(width*height);
    _sumKernel<<<blocks, threads>>>(image, width, height, kernel, kerwidth, kerheight);
}

sumKernel.h:

void sumKernel(char *image, int width, int height, char *kernel, int kerwidth, int kerheight);

main.cpp:

findCudaDevice(argc, (const char **)argv);

Mat image = imread("a.jpg");
Mat gray; cvtColor(image, gray, CV_BGR2GRAY);
imshow("before", gray);

char *gray_g;
cudaMalloc((void **)&gray_g, gray.size().area());
cudaMemcpy(gray_g, gray.data, sizeof(char)*gray.size().area(), cudaMemcpyHostToDevice);

char kernel[9];
char *kernel_g;
cudaMalloc((void **)&kernel_g, sizeof(char)*9);

sumKernel(gray_g, gray.cols, gray.rows, kernel, 3, 3);

cudaMemcpy(gray.data, gray_g, sizeof(char)*gray.size().area(), cudaMemcpyDeviceToHost);

imshow("after", gray);
waitKey(0);

cudaFree(kernel);
cudaFree(gray_g);

(发表我的评论作为答案)

check 所有 API 调用错误。很可能是您的宽度*高度线程的块大小对于单个块来说太大了。