将非方形图像存储在一个卷中
Storing non-square images in a volume
我一直在尝试将非正方形图像存储到一个卷(volume)中。当我把大小为 w512 x h512 的方形图像存储 512 次时,我的代码可以正常工作,但对于大小为 w512 x h1024 的非方形图像则会失败。程序在下面这一行报告 “unspecified launch failure” 错误:gpuErrchk( cudaDeviceSynchronize() );
我不确定为什么会这样。我已尽量让代码以合并(coalesced)的方式访问数据。对于我的问题,还有其他可行的解决方案吗?
这是我的内核:
// Copies `slice` into the region of `buffer` that belongs to frame `frameIdx`:
// each thread moves one float, addressed by blockIdx.x and threadIdx.x.
// NOTE(review): the parameters are declared as (height, width), but the launch
// in testStorage passes (width, height). Inside this kernel `width` therefore
// holds the image height and `height` the image width, so the row stride used
// below is wrong whenever the two dimensions differ.
__global__ void
copySlice2Volume2(float *buffer, float *slice, int height, int width, int frameIdx)
{
// Flat element index: block index times the row stride, plus the thread index.
int tid = (blockIdx.x * width) + threadIdx.x;
// Destination is offset by frameIdx whole frames of width*height elements.
buffer[tid + (frameIdx*width*height)] = slice[tid];
// NOTE(review): no shared memory is used here, so this barrier looks unnecessary.
__syncthreads();
}
下面是调用上面内核的函数:
/**
 * Loads a grayscale image, converts it to float, uploads it to the GPU and
 * copies it nFrames times into a device volume (one kernel launch per frame).
 *
 * Launch layout: one block per image row and one thread per pixel of a row,
 * so the image width must not exceed the device's threads-per-block limit.
 * Device memory needed: width * height * (nFrames + 1) floats.
 */
void testStorage() {
    const int nFrames = 512;

    // 8-bit unsigned char image, read as a single grayscale channel
    Mat frame, floatFrame;
    frame = imread("C:/Matlab code/im.png", CV_LOAD_IMAGE_GRAYSCALE);
    if (frame.empty()) {
        // imread yields an empty Mat when the file is missing or unreadable;
        // continuing would allocate zero-sized buffers and launch an empty grid.
        cerr << "testStorage: could not read input image" << endl;
        return;
    }

    // convert the uchar image to a float image
    frame.convertTo(floatFrame, CV_32F, 1.0/255.0f);

    // Take the dimensions in pixels. Mat::step is the row size in BYTES and
    // only equals the pixel width for an unpadded 8-bit single-channel image.
    const int width  = floatFrame.cols;
    const int height = floatFrame.rows;
    cout << "width: " << width << " height: " << height << endl;

    const size_t sliceBytes  = sizeof(float) * (size_t)width * (size_t)height;
    const size_t volumeBytes = sliceBytes * (size_t)nFrames;

    float *gpuBuffer = NULL;
    float *testImage = NULL;
    gpuErrchk( cudaMalloc( (void**) &gpuBuffer, volumeBytes ) ); // storage init for buffer
    gpuErrchk( cudaMemset( gpuBuffer, 0, volumeBytes ) );        // set mem to 0
    gpuErrchk( cudaMalloc( (void**) &testImage, sliceBytes ) );  // storage init for image
    gpuErrchk( cudaMemset( testImage, 0, sliceBytes ) );         // set mem to 0
    // NOTE(review): a single linear copy assumes floatFrame rows are stored
    // back to back (no row padding) — confirm if the Mat can be a ROI/view.
    gpuErrchk( cudaMemcpy( testImage, floatFrame.ptr<float>(), sliceBytes, cudaMemcpyHostToDevice ) );

    // num of threads: one per pixel in a row
    dim3 Threads(width);
    // num of blocks: one per row
    dim3 Blocks(height);

    for (int i = 0; i < nFrames; i++)
    {
        // Arguments are passed as (width, height, frameIdx); the kernel's
        // parameter list has to declare them in that same order.
        copySlice2Volume2<<< Blocks, Threads >>> (gpuBuffer, testImage, width, height, i);
        // A launch does not return a status; an invalid configuration (for
        // example too many threads per block) is only visible through this call.
        gpuErrchk( cudaGetLastError() );
    }

    // Faults that happen while the kernels execute are reported here.
    gpuErrchk( cudaDeviceSynchronize() );
    printf("Cuda status2: %s\n", cudaGetErrorString( cudaGetLastError() ) );

    gpuErrchk( cudaFree(gpuBuffer) );
    gpuErrchk( cudaFree(testImage) );
}
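内核的形参顺序是 (height, width),而调用处传入的实参顺序是 (width, height),两者正好对调,这就是错误的原因:对于 w512 x h1024 的图像,内核把 1024 当作行跨度来计算索引,导致越界访问;方形图像的宽和高相等,所以问题没有暴露出来。纠正之后一切正常。这是更正后的内核: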
/**
 * Copies one 2D image (`slice`) into frame `frameIdx` of a densely packed
 * volume (`buffer`), one float per thread.
 *
 * Expected launch layout: 1D grid with one block per image row
 * (gridDim.x >= height) and 1D blocks with one thread per pixel of a row
 * (blockDim.x >= width). No shared memory is used.
 *
 * @param buffer    destination volume, at least (frameIdx + 1) * width * height floats
 * @param slice     source image, width * height floats, row-major
 * @param width     number of pixels per row
 * @param height    number of rows
 * @param frameIdx  index of the frame inside `buffer` to overwrite
 */
__global__ void
copySlice2Volume2(float *buffer, float *slice, int width, int height, int frameIdx)
{
    const int col = threadIdx.x;
    const int row = blockIdx.x;

    // Threads outside the image do nothing, so an oversized launch cannot
    // read or write past the end of either allocation.
    if (col >= width || row >= height)
        return;

    // 64-bit arithmetic: frameIdx * width * height exceeds the int range
    // once the volume holds more than 2^31 elements.
    const size_t pixel       = (size_t)row * (size_t)width + (size_t)col;
    const size_t frameOffset = (size_t)frameIdx * (size_t)width * (size_t)height;

    // Consecutive threads of a block touch consecutive addresses.
    buffer[frameOffset + pixel] = slice[pixel];
}
我一直在尝试将非正方形图像存储到一个卷(volume)中。当我把大小为 w512 x h512 的方形图像存储 512 次时,我的代码可以正常工作,但对于大小为 w512 x h1024 的非方形图像则会失败。程序在下面这一行报告 “unspecified launch failure” 错误:gpuErrchk( cudaDeviceSynchronize() );
我不确定为什么会这样。我已尽量让代码以合并(coalesced)的方式访问数据。对于我的问题,还有其他可行的解决方案吗?
这是我的内核:
// Copies `slice` into the region of `buffer` that belongs to frame `frameIdx`:
// each thread moves one float, addressed by blockIdx.x and threadIdx.x.
// NOTE(review): the parameters are declared as (height, width), but the launch
// in testStorage passes (width, height). Inside this kernel `width` therefore
// holds the image height and `height` the image width, so the row stride used
// below is wrong whenever the two dimensions differ.
__global__ void
copySlice2Volume2(float *buffer, float *slice, int height, int width, int frameIdx)
{
// Flat element index: block index times the row stride, plus the thread index.
int tid = (blockIdx.x * width) + threadIdx.x;
// Destination is offset by frameIdx whole frames of width*height elements.
buffer[tid + (frameIdx*width*height)] = slice[tid];
// NOTE(review): no shared memory is used here, so this barrier looks unnecessary.
__syncthreads();
}
下面是调用上面内核的函数:
/**
 * Loads a grayscale image, converts it to float, uploads it to the GPU and
 * copies it nFrames times into a device volume (one kernel launch per frame).
 *
 * Launch layout: one block per image row and one thread per pixel of a row,
 * so the image width must not exceed the device's threads-per-block limit.
 * Device memory needed: width * height * (nFrames + 1) floats.
 */
void testStorage() {
    const int nFrames = 512;

    // 8-bit unsigned char image, read as a single grayscale channel
    Mat frame, floatFrame;
    frame = imread("C:/Matlab code/im.png", CV_LOAD_IMAGE_GRAYSCALE);
    if (frame.empty()) {
        // imread yields an empty Mat when the file is missing or unreadable;
        // continuing would allocate zero-sized buffers and launch an empty grid.
        cerr << "testStorage: could not read input image" << endl;
        return;
    }

    // convert the uchar image to a float image
    frame.convertTo(floatFrame, CV_32F, 1.0/255.0f);

    // Take the dimensions in pixels. Mat::step is the row size in BYTES and
    // only equals the pixel width for an unpadded 8-bit single-channel image.
    const int width  = floatFrame.cols;
    const int height = floatFrame.rows;
    cout << "width: " << width << " height: " << height << endl;

    const size_t sliceBytes  = sizeof(float) * (size_t)width * (size_t)height;
    const size_t volumeBytes = sliceBytes * (size_t)nFrames;

    float *gpuBuffer = NULL;
    float *testImage = NULL;
    gpuErrchk( cudaMalloc( (void**) &gpuBuffer, volumeBytes ) ); // storage init for buffer
    gpuErrchk( cudaMemset( gpuBuffer, 0, volumeBytes ) );        // set mem to 0
    gpuErrchk( cudaMalloc( (void**) &testImage, sliceBytes ) );  // storage init for image
    gpuErrchk( cudaMemset( testImage, 0, sliceBytes ) );         // set mem to 0
    // NOTE(review): a single linear copy assumes floatFrame rows are stored
    // back to back (no row padding) — confirm if the Mat can be a ROI/view.
    gpuErrchk( cudaMemcpy( testImage, floatFrame.ptr<float>(), sliceBytes, cudaMemcpyHostToDevice ) );

    // num of threads: one per pixel in a row
    dim3 Threads(width);
    // num of blocks: one per row
    dim3 Blocks(height);

    for (int i = 0; i < nFrames; i++)
    {
        // Arguments are passed as (width, height, frameIdx); the kernel's
        // parameter list has to declare them in that same order.
        copySlice2Volume2<<< Blocks, Threads >>> (gpuBuffer, testImage, width, height, i);
        // A launch does not return a status; an invalid configuration (for
        // example too many threads per block) is only visible through this call.
        gpuErrchk( cudaGetLastError() );
    }

    // Faults that happen while the kernels execute are reported here.
    gpuErrchk( cudaDeviceSynchronize() );
    printf("Cuda status2: %s\n", cudaGetErrorString( cudaGetLastError() ) );

    gpuErrchk( cudaFree(gpuBuffer) );
    gpuErrchk( cudaFree(testImage) );
}
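内核的形参顺序是 (height, width),而调用处传入的实参顺序是 (width, height),两者正好对调,这就是错误的原因:对于 w512 x h1024 的图像,内核把 1024 当作行跨度来计算索引,导致越界访问;方形图像的宽和高相等,所以问题没有暴露出来。纠正之后一切正常。这是更正后的内核: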
/**
 * Copies one 2D image (`slice`) into frame `frameIdx` of a densely packed
 * volume (`buffer`), one float per thread.
 *
 * Expected launch layout: 1D grid with one block per image row
 * (gridDim.x >= height) and 1D blocks with one thread per pixel of a row
 * (blockDim.x >= width). No shared memory is used.
 *
 * @param buffer    destination volume, at least (frameIdx + 1) * width * height floats
 * @param slice     source image, width * height floats, row-major
 * @param width     number of pixels per row
 * @param height    number of rows
 * @param frameIdx  index of the frame inside `buffer` to overwrite
 */
__global__ void
copySlice2Volume2(float *buffer, float *slice, int width, int height, int frameIdx)
{
    const int col = threadIdx.x;
    const int row = blockIdx.x;

    // Threads outside the image do nothing, so an oversized launch cannot
    // read or write past the end of either allocation.
    if (col >= width || row >= height)
        return;

    // 64-bit arithmetic: frameIdx * width * height exceeds the int range
    // once the volume holds more than 2^31 elements.
    const size_t pixel       = (size_t)row * (size_t)width + (size_t)col;
    const size_t frameOffset = (size_t)frameIdx * (size_t)width * (size_t)height;

    // Consecutive threads of a block touch consecutive addresses.
    buffer[frameOffset + pixel] = slice[pixel];
}