Why is my CUDA ray tracer giving me error code 700 with this thread layout?
My goal is to write a simple ray tracer with CUDA in C++, using the Phong shading model. It should compute the appropriate color for each pixel and write it into a framebuffer on the GPU; I then write the framebuffer values out to a .ppm file on the CPU. My image size is 512x512, so for the thread layout in the kernel call I used the parameters dim3 thread_blocks(16, 16) and dim3 threads_per_block(32, 32).

In theory, this should give me access to (16*16) * (32*32) threads, which equals the number of pixels in the image (512 * 512). But it gives me a CUDA error with error code 700 from cudaMemcpy, on the line where I copy the data from the device back to the host. Using a smaller number of threads per block, such as dim3 threads_per_block(16, 16), works fine, but of course it only renders 1/4 of the image.

I have also tried other thread layouts, including ones explained specifically for 2D images, and they produce the same error, so this is where I need help. (For reference, a general way to derive such a layout is sketched below.)
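As an aside, a layout like the one above can be derived for any image size with ceiling division; threads that fall outside the image are then filtered by the kernel's bounds check. A minimal sketch, using the WIDTH and HEIGHT constants from the kernel below:

dim3 threads_per_block(32, 32);  // 32*32 = 1024 threads, the per-block hardware maximum
dim3 thread_blocks((WIDTH + threads_per_block.x - 1) / threads_per_block.x,
                   (HEIGHT + threads_per_block.y - 1) / threads_per_block.y);
cast_ray<<<thread_blocks, threads_per_block>>>(fb_device, spheres_dv, light_dv, origin_dv);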
The kernel call:
void run_kernel(const int size, Vec3f* fb, Sphere* spheres, Light* light, Vec3f* origin) {
    // empty_kernel<<<dim3(16, 16, 1), dim3(32, 32, 1)>>>();
    // cudaDeviceSynchronize();

    Vec3f* fb_device = nullptr;
    Sphere* spheres_dv = nullptr;
    Light* light_dv = nullptr;
    Vec3f* origin_dv = nullptr;

    checkErrorsCuda(cudaMalloc((void**) &fb_device, sizeof(Vec3f) * size));
    checkErrorsCuda(cudaMemcpy((void*) fb_device, fb, sizeof(Vec3f) * size, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &spheres_dv, sizeof(Sphere) * 3));
    checkErrorsCuda(cudaMemcpy((void*) spheres_dv, spheres, sizeof(Sphere) * 3, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &light_dv, sizeof(Light) * 1));
    checkErrorsCuda(cudaMemcpy((void*) light_dv, light, sizeof(Light) * 1, cudaMemcpyHostToDevice));

    checkErrorsCuda(cudaMalloc((void**) &origin_dv, sizeof(Vec3f) * 1));
    checkErrorsCuda(cudaMemcpy((void*) origin_dv, origin, sizeof(Vec3f) * 1, cudaMemcpyHostToDevice));

    cudaEvent_t start, stop;
    float time = 0;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaEventRecord(start, 0);
    cast_ray<<<dim3(16, 16), dim3(32, 32)>>>(fb_device, spheres_dv, light_dv, origin_dv);
    cudaEventRecord(stop, 0);
    cudaEventSynchronize(stop);
    cudaEventElapsedTime(&time, start, stop);
    printf("%f ms\n", time);

    checkErrorsCuda(cudaMemcpy(fb, fb_device, sizeof(Vec3f) * size, cudaMemcpyDeviceToHost));

    cudaEventDestroy(start);
    cudaEventDestroy(stop);

    checkErrorsCuda(cudaFree(fb_device));
    checkErrorsCuda(cudaFree(spheres_dv));
    checkErrorsCuda(cudaFree(light_dv));
    checkErrorsCuda(cudaFree(origin_dv));
}
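Note that kernel launches are asynchronous: an illegal memory access inside cast_ray is only reported by the next call that synchronizes with the device, which is why error 700 (cudaErrorIllegalAddress) surfaces at the device-to-host cudaMemcpy. A minimal sketch for catching it right at the launch site instead, reusing the checkErrorsCuda macro from above:

cast_ray<<<dim3(16, 16), dim3(32, 32)>>>(fb_device, spheres_dv, light_dv, origin_dv);
checkErrorsCuda(cudaGetLastError());       // catches invalid launch configurations
checkErrorsCuda(cudaDeviceSynchronize());  // waits for the kernel, surfacing runtime errors here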
The cast_ray function:
__global__ void cast_ray(Vec3f* fb, Sphere* spheres, Light* light, Vec3f* origin) {
    int i = (blockIdx.x * blockDim.x) + threadIdx.x;
    int j = (blockIdx.y * blockDim.y) + threadIdx.y;
    int tid = (j * WIDTH) + i;
    if (i >= WIDTH || j >= HEIGHT) return;

    // map the pixel to normalized device coordinates in [-1, 1]
    Vec3f ij(2 * (float(i + 0.5) / (WIDTH - 1)) - 1, 1 - 2 * (float(j + 0.5) / (HEIGHT - 1)), -1);
    Vec3f* dir = new Vec3f(ij - *origin);  // per-thread device-side allocation (the culprit, see the resolution below)
    Ray r(*origin, *dir);

    float intersections[3];
    int hp = -1;
    for (int ii = 0; ii < 3; ii++) {
        intersections[ii] = r.has_intersection(spheres[ii]);
    }

    // always 3 here, since intersections is a fixed-size array
    int asize = sizeof(intersections) / sizeof(*intersections);
    if (asize == 1) {
        hp = intersections[0] < 0 ? -1 : 0;
    } else if (asize != 0) {
        // pick the closest non-negative intersection
        float min_val = 100.0;
        for (int ii = 0; ii < asize; ii++) {
            if (intersections[ii] < 0.0) continue;
            else if (intersections[ii] < min_val) {
                min_val = intersections[ii];
                hp = ii;
            }
        }
    }

    if (hp == -1) {
        fb[tid] = Color(94, 156, 255);  // background color
    } else {
        auto color = get_color_at(r, intersections[hp], light, spheres[hp], spheres);
        fb[tid] = color;
    }
}
Error message: CUDA error at ./main.cu::195 with error code 700 for cudaMemcpy(fb, fb_device, sizeof(Vec3f) * size, cudaMemcpyDeviceToHost)().

(The corresponding line is the cudaMemcpy after the printf in the kernel-calling function.)

With cuda-memcheck I get the following:
========= Error: process didn't terminate successfully
========= Out-of-range Shared or Local Address
========= at 0x00000100 in __cuda_syscall_mc_dyn_globallock_check
========= by thread (0,7,0) in block (2,5,0)
(This was tried on an RTX 2060 SUPER.)
Changing Vec3f *dir = new Vec3f(ij - *origin); to Vec3f dir(ij - *origin); (and passing dir instead of *dir to the Ray constructor) solved the problem! dir being a pointer was a leftover from an earlier iteration of the code that was no longer needed. But even when a pointer is needed: never forget to delete everything you new.
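The likely mechanism, as I understand it: device-side new draws from a fixed-size device heap (8 MB by default), and with 512x512 threads each allocating and never freeing, that heap runs out; device-side new then returns nullptr instead of throwing, and dereferencing the null pointer produces the illegal address. A minimal sketch of the fix, plus the heap-related alternative in case per-thread allocation were ever genuinely required (the 64 MB figure is an arbitrary example):

// In cast_ray: build the ray direction on the stack -- no device heap involved.
Vec3f dir(ij - *origin);
Ray r(*origin, dir);

// If per-thread device-side new were really needed instead, the device heap
// can be enlarged from the host before the launch, e.g.:
// cudaDeviceSetLimit(cudaLimitMallocHeapSize, 64 * 1024 * 1024);
// and every new must then be paired with a delete inside the kernel.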