'an illegal memory access' 尝试写入使用 cudaMalloc3D 分配的二维数组时
'an illegal memory access' when trying to write to a 2D array allocated using cudaMalloc3D
我正在尝试使用 cudaMalloc3D 将展平的二维数组的内存分配和复制到设备上,以测试 cudaMalloc3D 的性能。但是当我尝试从内核写入数组时,它会抛出 'an illegal memory access was encountered' 异常。如果我只是从数组中读取,程序运行良好,但当我尝试写入时,出现错误。对此的任何帮助将不胜感激。下面是我的代码和编译代码的语法。
编译使用
nvcc -O2 -arch sm_20 test.cu
代码:test.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PI 3.14159265
#define NX 8192 /* includes boundary points on both end */
#define NY 4096 /* includes boundary points on both end */
#define NZ 1 /* needed for cudaMalloc3D */
#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X
#define N_BLOCKS_Y NY/N_THREADS_Y
#define LX 4.0 /* length of the domain in x-direction */
#define LY 2.0 /* length of the domain in x-direction */
#define dx (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd 5.0
#define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR ( cSqrd * (dt*dt)/(dx*dx) )
#define IC (i + j*NX) /* (i,j) */
#define IM1 (i + j*NX - 1) /* (i-1,j) */
#define IP1 (i + j*NX + 1) /* (i+1,j) */
#define JM1 (i + (j-1)*NX) /* (i,j-1) */
#define JP1 (i + (j+1)*NX) /* (i,j+1) */
// Macro for checking CUDA errors following a CUDA launch or API call
#define cudaCheckError() {\
cudaError_t e = cudaGetLastError();\
if( e != cudaSuccess ) {\
printf("\nCuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e));\
exit(EXIT_FAILURE);\
}\
}
typedef double REAL;
typedef int INT;
void meshGrid ( REAL *x, REAL *y )
{
INT i,j;
REAL a;
for (j=0; j<NY; j++) {
a = dx * ( (REAL) j );
for (i=0; i<NX; i++) {
x[IC] = dx * ( (REAL) i );
y[IC] = a;
}
}
}
void initWave ( REAL *u, REAL *uold, REAL *x, REAL *y )
{
INT i,j;
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
}
}
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
}
__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{
INT i,j;
i = blockIdx.x*blockDim.x + threadIdx.x;
j = blockIdx.y*blockDim.y + threadIdx.y;
if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {
char *unewPtr = (char *) unew.ptr;
REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);
REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error
}
}
INT main(INT argc, char *argv[])
{
INT nTimeSteps = 10;
// pointers for the host side
REAL *unew, *u, *uold, *uFinal, *x, *y;
// allocate memory on the host
unew = (REAL *)calloc(NX*NY,sizeof(REAL));
u = (REAL *)calloc(NX*NY,sizeof(REAL));
uold = (REAL *)calloc(NX*NY,sizeof(REAL));
uFinal = (REAL *)calloc(NX*NY,sizeof(REAL));
x = (REAL *)calloc(NX*NY,sizeof(REAL));
y = (REAL *)calloc(NX*NY,sizeof(REAL));
// pointer for the device side
size_t pitch = NX * sizeof(REAL);
cudaPitchedPtr d_u, d_uold, d_unew, d_tmp;
cudaExtent myExtent = make_cudaExtent(pitch, NY, NZ);
// allocate 3D memory on the device
cudaMalloc3D( &d_u, myExtent ); cudaCheckError();
cudaMalloc3D( &d_uold, myExtent ); cudaCheckError();
cudaMalloc3D( &d_unew, myExtent ); cudaCheckError();
// initialize grid and wave
meshGrid( x, y );
initWave( u, uold, x, y );
// copy host memory to 3D device memory
cudaMemcpy3DParms cpy3D = { 0 };
cpy3D.kind = cudaMemcpyHostToDevice;
// copying u to d_u
cpy3D.srcPtr = make_cudaPitchedPtr(u, pitch, NX, NY);
cpy3D.dstPtr = d_u;
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// copying uold to d_uold
cpy3D.srcPtr = make_cudaPitchedPtr(uold, pitch, NX, NY);
cpy3D.dstPtr = d_uold;
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// set up the GPU grid/block model
dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y );
dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );
for ( INT n = 1; n < nTimeSteps + 1; n++ ) {
solveWaveGPU <<< dimGrid, dimBlock >>> ( d_uold, d_u, d_unew );
cudaThreadSynchronize();
cudaCheckError();
d_tmp = d_uold;
d_uold = d_u;
d_u = d_unew;
d_unew = d_tmp;
}
// copy the memory back to host
cpy3D.kind = cudaMemcpyDeviceToHost;
// copying d_unew to uFinal
cpy3D.srcPtr = d_unew;
cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, pitch, NX, NY);
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
free(u); cudaFree(d_u.ptr);
free(unew); cudaFree(d_unew.ptr);
free(uold); cudaFree(d_uold.ptr);
free(uFinal); free(x); free(y);
return EXIT_SUCCESS;
}
此行没有出现错误的原因:
REAL tmp = unew_row[j]; // no error on this line
是因为编译器正在优化那一行。它没有做任何有用的事情,所以编译器完全消除了它。编译器警告:
xxx.cu(87): warning: variable "tmp" was declared but never referenced
是对这种效果的暗示。
您的代码几乎是正确的。问题在这里:
REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);
应该是:
REAL *unew_row = (REAL *) (unewPtr + j * unew.pitch);
内核中的 i
变量是 width(即 X)维度。
j
变量是 高度 (即 Y)维度。
高度是指你在哪一行,因此行间距应该乘以高度参数,即j
,而不是i
。
同样,虽然它不是特定维度的特定故障的根源,但此代码也可能不是您想要的:
REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error
例如,如果您打算计算行的偏移量然后索引到行中(例如,可能设置分配中的每个元素)那么我想您会想要使用 i
不是 j
作为你的最终索引:
REAL tmp = unew_row[i]; // no error on this line
unew_row[i] = 1.2; // this is where I get the error
但是,对于这个特定示例,这并不是非法内存访问的实际来源。
我正在尝试使用 cudaMalloc3D 将展平的二维数组的内存分配和复制到设备上,以测试 cudaMalloc3D 的性能。但是当我尝试从内核写入数组时,它会抛出 'an illegal memory access was encountered' 异常。如果我只是从数组中读取,程序运行良好,但当我尝试写入时,出现错误。对此的任何帮助将不胜感激。下面是我的代码和编译代码的语法。
编译使用
nvcc -O2 -arch sm_20 test.cu
代码:test.cu
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#define PI 3.14159265
#define NX 8192 /* includes boundary points on both end */
#define NY 4096 /* includes boundary points on both end */
#define NZ 1 /* needed for cudaMalloc3D */
#define N_THREADS_X 16
#define N_THREADS_Y 16
#define N_BLOCKS_X NX/N_THREADS_X
#define N_BLOCKS_Y NY/N_THREADS_Y
#define LX 4.0 /* length of the domain in x-direction */
#define LY 2.0 /* length of the domain in x-direction */
#define dx (REAL) ( LX/( (REAL) (NX) ) )
#define cSqrd 5.0
#define dt (REAL) ( 0.4 * dx / sqrt(cSqrd) )
#define FACTOR ( cSqrd * (dt*dt)/(dx*dx) )
#define IC (i + j*NX) /* (i,j) */
#define IM1 (i + j*NX - 1) /* (i-1,j) */
#define IP1 (i + j*NX + 1) /* (i+1,j) */
#define JM1 (i + (j-1)*NX) /* (i,j-1) */
#define JP1 (i + (j+1)*NX) /* (i,j+1) */
// Macro for checking CUDA errors following a CUDA launch or API call
#define cudaCheckError() {\
cudaError_t e = cudaGetLastError();\
if( e != cudaSuccess ) {\
printf("\nCuda failure %s:%d: '%s'\n",__FILE__,__LINE__,cudaGetErrorString(e));\
exit(EXIT_FAILURE);\
}\
}
typedef double REAL;
typedef int INT;
void meshGrid ( REAL *x, REAL *y )
{
INT i,j;
REAL a;
for (j=0; j<NY; j++) {
a = dx * ( (REAL) j );
for (i=0; i<NX; i++) {
x[IC] = dx * ( (REAL) i );
y[IC] = a;
}
}
}
void initWave ( REAL *u, REAL *uold, REAL *x, REAL *y )
{
INT i,j;
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
u[IC] = 0.1 * (4.0*x[IC]-x[IC]*x[IC]) * ( 2.0*y[IC] - y[IC]*y[IC] );
}
}
for (j=1; j<NY-1; j++) {
for (i=1; i<NX-1; i++) {
uold[IC] = u[IC] + 0.5*FACTOR*( u[IP1] + u[IM1] + u[JP1] + u[JM1] - 4.0*u[IC] );
}
}
}
__global__ void solveWaveGPU ( cudaPitchedPtr uold, cudaPitchedPtr u, cudaPitchedPtr unew )
{
INT i,j;
i = blockIdx.x*blockDim.x + threadIdx.x;
j = blockIdx.y*blockDim.y + threadIdx.y;
if (i>0 && i < (NX-1) && j>0 && j < (NY-1) ) {
char *unewPtr = (char *) unew.ptr;
REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);
REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error
}
}
INT main(INT argc, char *argv[])
{
INT nTimeSteps = 10;
// pointers for the host side
REAL *unew, *u, *uold, *uFinal, *x, *y;
// allocate memory on the host
unew = (REAL *)calloc(NX*NY,sizeof(REAL));
u = (REAL *)calloc(NX*NY,sizeof(REAL));
uold = (REAL *)calloc(NX*NY,sizeof(REAL));
uFinal = (REAL *)calloc(NX*NY,sizeof(REAL));
x = (REAL *)calloc(NX*NY,sizeof(REAL));
y = (REAL *)calloc(NX*NY,sizeof(REAL));
// pointer for the device side
size_t pitch = NX * sizeof(REAL);
cudaPitchedPtr d_u, d_uold, d_unew, d_tmp;
cudaExtent myExtent = make_cudaExtent(pitch, NY, NZ);
// allocate 3D memory on the device
cudaMalloc3D( &d_u, myExtent ); cudaCheckError();
cudaMalloc3D( &d_uold, myExtent ); cudaCheckError();
cudaMalloc3D( &d_unew, myExtent ); cudaCheckError();
// initialize grid and wave
meshGrid( x, y );
initWave( u, uold, x, y );
// copy host memory to 3D device memory
cudaMemcpy3DParms cpy3D = { 0 };
cpy3D.kind = cudaMemcpyHostToDevice;
// copying u to d_u
cpy3D.srcPtr = make_cudaPitchedPtr(u, pitch, NX, NY);
cpy3D.dstPtr = d_u;
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// copying uold to d_uold
cpy3D.srcPtr = make_cudaPitchedPtr(uold, pitch, NX, NY);
cpy3D.dstPtr = d_uold;
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
// set up the GPU grid/block model
dim3 dimGrid ( N_BLOCKS_X , N_BLOCKS_Y );
dim3 dimBlock ( N_THREADS_X, N_THREADS_Y );
for ( INT n = 1; n < nTimeSteps + 1; n++ ) {
solveWaveGPU <<< dimGrid, dimBlock >>> ( d_uold, d_u, d_unew );
cudaThreadSynchronize();
cudaCheckError();
d_tmp = d_uold;
d_uold = d_u;
d_u = d_unew;
d_unew = d_tmp;
}
// copy the memory back to host
cpy3D.kind = cudaMemcpyDeviceToHost;
// copying d_unew to uFinal
cpy3D.srcPtr = d_unew;
cpy3D.dstPtr = make_cudaPitchedPtr(uFinal, pitch, NX, NY);
cpy3D.extent = myExtent;
cudaMemcpy3D( &cpy3D ); cudaCheckError();
free(u); cudaFree(d_u.ptr);
free(unew); cudaFree(d_unew.ptr);
free(uold); cudaFree(d_uold.ptr);
free(uFinal); free(x); free(y);
return EXIT_SUCCESS;
}
此行没有出现错误的原因:
REAL tmp = unew_row[j]; // no error on this line
是因为编译器正在优化那一行。它没有做任何有用的事情,所以编译器完全消除了它。编译器警告:
xxx.cu(87): warning: variable "tmp" was declared but never referenced
是对这种效果的暗示。
您的代码几乎是正确的。问题在这里:
REAL *unew_row = (REAL *) (unewPtr + i * unew.pitch);
应该是:
REAL *unew_row = (REAL *) (unewPtr + j * unew.pitch);
内核中的 i
变量是 width(即 X)维度。
j
变量是 高度 (即 Y)维度。
高度是指你在哪一行,因此行间距应该乘以高度参数,即j
,而不是i
。
同样,虽然它不是特定维度的特定故障的根源,但此代码也可能不是您想要的:
REAL tmp = unew_row[j]; // no error on this line
unew_row[j] = 1.2; // this is where I get the error
例如,如果您打算计算行的偏移量然后索引到行中(例如,可能设置分配中的每个元素)那么我想您会想要使用 i
不是 j
作为你的最终索引:
REAL tmp = unew_row[i]; // no error on this line
unew_row[i] = 1.2; // this is where I get the error
但是,对于这个特定示例,这并不是非法内存访问的实际来源。