Why is my CUDA ray tracer giving me error code 700 with this thread layout?
My goal is to write a simple ray tracer with CUDA in C++, using the Phong shading model. It should compute the appropriate color for each pixel and write it into a framebuffer on the GPU; I then write the framebuffer values out to a .ppm file on the CPU. My image size is 512x512, so for the thread layout in the kernel call I used the parameters dim3 thread_blocks(16, 16) and dim3 threads_per_block(32, 32).

In theory, this should give me access to (16*16) * (32*32) threads, which equals the number of pixels in the image (512 * 512). But it gives me a CUDA error with error code 700 from cudaMemcpy, on the line where I copy the data from the device back to the host. Using a smaller number of threads per block, such as dim3 threads_per_block(16, 16), works fine, but of course it only renders 1/4 of the image.

I have also tried other thread layouts, including ones explained specifically for 2D images, and they produce the same error, so this is where I need help. (For reference, a general way to derive such a layout is sketched below.)
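As an aside, a layout like the one above can be derived for any image size with ceiling division; threads that fall outside the image are then filtered by the kernel's bounds check. A minimal sketch, using the WIDTH and HEIGHT constants from the kernel below:

dim3 threads_per_block(32, 32);  // 32*32 = 1024 threads, the per-block hardware maximum
dim3 thread_blocks((WIDTH + threads_per_block.x - 1) / threads_per_block.x,
                   (HEIGHT + threads_per_block.y - 1) / threads_per_block.y);
cast_ray<<<thread_blocks, threads_per_block>>>(fb_device, spheres_dv, light_dv, origin_dv);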
The kernel call:
void run_kernel(const int size, Vec3f* fb, Sphere* spheres, Light* light, Vec3f* origin) {
    // empty_kernel<<<dim3(16, 16, 1), dim3(32, 32, 1)>>>();
    // cudaDeviceSynchronize();

    Vec3f* fb_device = nullptr;
    Sphere* spheres_dv = nullptr;
    Light* light_dv = nullptr;
    Vec3f* origin_dv = nullptr;

    checkErrorsCuda(cudaMalloc((void**) &fb_device, sizeof(Vec3f) * size));
    checkErrorsCuda(cudaMemcpy((void*) fb_device, fb, sizeof(Vec3f) * size, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &spheres_dv, sizeof(Sphere) * 3));
    checkErrorsCuda(cudaMemcpy((void*) spheres_dv, spheres, sizeof(Sphere) * 3, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &light_dv, sizeof(Light) * 1));
    checkErrorsCuda(cudaMemcpy((void*) light_dv, light, sizeof(Light) * 1, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &origin_dv, sizeof(Vec3f) * 1));
    checkErrorsCuda(cudaMemcpy((void*) origin_dv, origin, sizeof(Vec3f) * 1, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    cast_ray<<<dim3(16, 16), dim3(32, 32)>>>(fb_device, spheres_dv, light_dv, origin_dv);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("%f ms\n", time);

    checkErrorsCuda(cudaMemcpy(fb, fb_device, sizeof(Vec3f) * size, cudaMemcpyDeviceToHost));

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    checkErrorsCuda(cudaFree(fb_device));
    checkErrorsCuda(cudaFree(spheres_dv));
    checkErrorsCuda(cudaFree(light_dv));
    checkErrorsCuda(cudaFree(origin_dv));
}
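Note that kernel launches are asynchronous: an illegal memory access inside cast_ray is only reported by the next call that synchronizes with the device, which is why error 700 (cudaErrorIllegalAddress) surfaces at the device-to-host cudaMemcpy. A minimal sketch for catching it right at the launch site instead, reusing the checkErrorsCuda macro from above:

cast_ray<<<dim3(16, 16), dim3(32, 32)>>>(fb_device, spheres_dv, light_dv, origin_dv);
checkErrorsCuda(cudaGetLastError());       // catches invalid launch configurations
checkErrorsCuda(cudaDeviceSynchronize());  // waits for the kernel, surfacing runtime errors here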
The cast_ray function:
__global__ void cast_ray(Vec3f* fb, Sphere* spheres, Light* light, Vec3f* origin) {
    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    int j = (blockIdx.y * blockDim.y) + threadIdx.y;
    int tid = (j * WIDTH) + i;
    if (i >= WIDTH || j >= HEIGHT) return;

    // map the pixel to normalized device coordinates in [-1, 1]
    Vec3f ij(2 * (float(i + 0.5) / (WIDTH - 1)) - 1, 1 - 2 * (float(j + 0.5) / (HEIGHT - 1)), -1);
    Vec3f* dir = new Vec3f(ij - *origin);  // per-thread device-side allocation (the culprit, see the resolution below)
    Ray r(*origin, *dir);

    float intersections[3];
    int hp = -1;
    for (int ii = 0; ii < 3; ii++) {
        intersections[ii] = r.has_intersection(spheres[ii]);
    }

    // always 3 here, since intersections is a fixed-size array
    int asize = sizeof(intersections) / sizeof(*intersections);
    if (asize == 1) {
        hp = intersections[0] < 0 ? -1 : 0;
    } else if (asize != 0) {
        // pick the closest non-negative intersection
        float min_val = 100.0;
        for (int ii = 0; ii < asize; ii++) {
            if (intersections[ii] < 0.0) continue;
            else if (intersections[ii] < min_val) {
                min_val = intersections[ii];
                hp = ii;
            }
        }
    }

    if (hp == -1) {
        fb[tid] = Color(94, 156, 255);  // background color
    } else {
        auto color = get_color_at(r, intersections[hp], light, spheres[hp], spheres);
        fb[tid] = color;
    }
}
Error message: CUDA error at ./main.cu::195 with error code 700 for cudaMemcpy(fb, fb_device, sizeof(Vec3f) * size, cudaMemcpyDeviceToHost)().

(The corresponding line is the cudaMemcpy after the printf in the kernel-calling function.)

With cuda-memcheck I get the following:
========= Error: process didn't terminate successfully
========= Out-of-range Shared or Local Address
========= at 0x00000100 in __cuda_syscall_mc_dyn_globallock_check
========= by thread (0,7,0) in block (2,5,0)
(This was tried on an RTX 2060 SUPER.)
Changing Vec3f *dir = new Vec3f(ij - *origin); to Vec3f dir(ij - *origin); (and passing dir instead of *dir to the Ray constructor) solved the problem! dir being a pointer was a leftover from an earlier iteration of the code that was no longer needed. But even when a pointer is needed: never forget to delete everything you new.
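The likely mechanism, as I understand it: device-side new draws from a fixed-size device heap (8 MB by default), and with 512x512 threads each allocating and never freeing, that heap runs out; device-side new then returns nullptr instead of throwing, and dereferencing the null pointer produces the illegal address. A minimal sketch of the fix, plus the heap-related alternative in case per-thread allocation were ever genuinely required (the 64 MB figure is an arbitrary example):

// In cast_ray: build the ray direction on the stack -- no device heap involved.
Vec3f dir(ij - *origin);
Ray r(*origin, dir);

// If per-thread device-side new were really needed instead, the device heap
// can be enlarged from the host before the launch, e.g.:
// cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64 * 1024 * 1024);
// and every new must then be paired with a delete inside the kernel.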