如何为cuda内核创建一个临时二维变量

How to create a temporary 2D variable for cuda kernel

我在CUDA全局内核中声明的IxIy会由于未知原因导致遇到非法内存访问。这是代码:

#include "opencv2/opencv.hpp"
#include "opencv2/highgui.hpp"
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <omp.h>
#include <stdlib.h>

// Cuda
#include <cuda.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"


#define CHECK_FINAL_RESULT
//#define CHECK_LOADING_DATA
using namespace std;
const int TSIZEX = 32;
const int TSIZEY = 256;
const int ft_size = 1;

// Mathematical algorithms
#define isl_min(x,y)        ((x) < (y) ? (x) : (y))         // compare value x is lesser than y, if correct use x, if wrong use y
#define isl_max(x,y)        ((x) > (y) ? (x) : (y))         // comapre value x is larger than y, if correct use y, if wrong use x

__device__ float cudafilter2sq(float A[16][34], float B[34][258], int i, int j);
__global__ void cudapipeline_harris(int  C, int  R, float* img, float* harris);

__device__ float cudafilter2sq(float A[34][258], float B[34][258], int i, int j) {

    return (A[i - 1][j - 1] * B[i - 1][j - 1] +
        A[i - 1][j] * B[i - 1][j] +
        A[i - 1][j + 1] * B[i - 1][j + 1] +
        A[i][j - 1] * B[i][j - 1] +
        A[i][j] * B[i][j] +
        A[i][j + 1] * B[i][j + 1] +
        A[i + 1][j - 1] * B[i + 1][j - 1] +
        A[i + 1][j] * B[i + 1][j] +
        A[i + 1][j + 1] * B[i + 1][j + 1]);
}



__global__ void cudapipeline_harris(int  C, int  R, float* img, float* harris) {

    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    int idy = threadIdx.y + blockIdx.y * blockDim.y;
    int idz = threadIdx.z + blockIdx.z * blockDim.z;

    float Ix[TSIZEX + 2 * ft_size][TSIZEY + 2 * ft_size];
    float Iy[TSIZEX + 2 * ft_size][TSIZEY + 2 * ft_size];

    for (int Ti = idx; Ti < (float)(R / TSIZEX); Ti += gridDim.x * blockDim.x)
    //if (Ti < (R / TSIZEX))
    {
        //for (int Tj = 0; Tj <= (float)(C / TSIZEY); Tj++)
        for (int Tj = idy; Tj < (float)(C/ TSIZEY); Tj += gridDim.y * blockDim.y)
        {
            int bot0, top0, right0, left0;
            int height, width;

            bot0 = isl_min(isl_max(Ti * TSIZEX, ft_size), R - ft_size);
            top0 = isl_min((Ti + 1) * TSIZEX, R - ft_size);
            left0 = isl_min(isl_max(Tj * TSIZEY, ft_size), C - ft_size);
            right0 = isl_min((Tj + 1) * TSIZEY, C - ft_size);

            width = right0 - left0;
            height = top0 - bot0;


            for (int i = bot0; i <= top0; i++)
            {
                for (int j = left0; j <= right0; j++)
                {
                    //printf("Ix : %d ", i - bot0);
                    Ix[i - bot0][j - left0] = img[(i - 1) * C + j - 1] * (-0.0833333333333f) +
                        img[(i + 1) * C + j - 1] * 0.0833333333333f +
                        img[(i + 1) * C + j] * 0.166666666667f +
                        img[(i - 1) * C + j] * -0.166666666667f +
                        img[(i - 1) * C + j + 1] * -0.0833333333333f +
                        img[(i + 1) * C + j + 1] * 0.0833333333333f;

                    Iy[i - bot0][j - left0] = img[(i - 1) * C + j - 1] * (-0.0833333333333f) +
                        img[(i - 1) * C + j + 1] * 0.0833333333333f +
                        img[i * C + j - 1] * -0.166666666667f +
                        img[i * C + j + 1] * 0.166666666667f +
                        img[(i + 1) * C + j - 1] * -0.0833333333333f +
                        img[(i + 1) * C + j + 1] * 0.0833333333333f;

                }
            }

           // for (int i = idy + bot0;i < (float)top0; i += gridDim.y * blockDim.y)
            for (int i = bot0; i < top0; i++)
            {
                for (int j = left0; j < right0; j++)
                {
                    int newI = i - bot0;
                    int newJ = j - left0;

                    harris[((i)*C + (j))] = cudafilter2sq(Ix, Ix, newI, newJ) * cudafilter2sq(Iy, Iy, newI, newJ) -
                        cudafilter2sq(Ix, Iy, newI, newJ) * cudafilter2sq(Ix, Iy, newI, newJ) -
                        (0.04f * (cudafilter2sq(Ix, Ix, newI, newJ) + cudafilter2sq(Iy, Iy, newI, newJ))) *
                        (cudafilter2sq(Ix, Ix, newI, newJ) + cudafilter2sq(Iy, Iy, newI, newJ));
                }
            }

        }
    }


}


int main(int argc, char** argv)
{
    int i, j, run;                // looping variables
    int R, C, nruns;              // height, width and number of loops runs
    double begin, end;            // each loop start time and end time
    double init, finish;          // total loop start time and end time
    double stime, avgt;           // time used and total avgt time
    cv::Mat image, loaded_data;
    cv::Scalar sc;
    cv::Size size;

    float* t_res;
    float* t_data;

    // Might be unused depending on preprocessor macro definitions
    (void)t_res;
    (void)t_data;
    (void)loaded_data;

    float* data;
    float* res;

    if (argc != 3)
    {
        printf("Does not set the NRuns and image needed\n");
        return -1;
    }

    image = cv::imread(argv[1], 1);   // read image from command line argument [1]

    if (!image.data)
    {
        printf("No image data ! Are you sure %s is an image ?\n", argv[1]);
        return -1;
    }

    // Convert image input to grayscale floating point
    cv::cvtColor(image, image, cv::COLOR_BGR2GRAY);
    size = image.size();
    C = size.width;
    R = size.height;

    printf("Values settings :\n");
    printf("-------------------\n");
    printf("Image Used : %s [%i, %i] \n", argv[1], R, C);

    res = (float*)calloc(R * C, sizeof(*res));

    if (res == NULL)
    {
        printf("Error while allocating result table of size %ld B\n",
            (sizeof(*res) * C * R));
        return -1;
    }

    data = (float*)malloc(R * C * sizeof(float));
    for (i = 0; i < R; i++) {
        for (j = 0; j < C; j++) {
            sc = image.at<uchar>(i, j);
            data[i * C + j] = (float)sc.val[0] / 255;
        }
    }

    // Parallel Running Test
    printf("\n\n-----------------------------------\n");
    printf("Cuda\n");
    printf("-----------------------------------\n");
    res = (float*)calloc(R * C, sizeof(*res));                // reset resources value

    dim3 grid(2,2,2);
    dim3 block(16,16,1);

    // Data required to pass to device
    float* img, * harris;

    cudaEvent_t start, stop;
    cudaEventCreate(&start);
    cudaEventCreate(&stop);

    cudaMalloc((void**)&img, R * C * sizeof(*img));
    cudaMalloc((void**)&harris, R * C * sizeof(*harris));


    cudaMemcpy(img, data, C * R * sizeof(*data), cudaMemcpyHostToDevice);   // pass image value to the GPU

    cudaEventRecord(start);
    cudapipeline_harris << < grid, block >> > (C, R, img, harris);
    cudaEventRecord(stop);

    cudaEventSynchronize(stop);
    float milliseconds = 0;
    cudaEventElapsedTime(&milliseconds, start, stop);
    //cudapipeline_harris << < 1, 8 >> > (C, R, img, harris);

   cudaDeviceSynchronize();


    cudaMemcpy(res, harris, C * R * sizeof(*harris), cudaMemcpyDeviceToHost);
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
    {
        printf("CUDA ERROR : %s", cudaGetErrorString(err));
        exit(-1);
    }

    printf("Total time   :  \t %f ms\n", milliseconds);


#ifdef CHECK_FINAL_RESULT
    // Serial Show input
    cv::namedWindow("Input", cv::WINDOW_NORMAL);
    cv::imshow("Input", image);
    image.release();
    // Parallel Show output
    cv::Mat imres = cv::Mat(R, C, CV_32F, res);
    cv::namedWindow("Parallel Output", cv::WINDOW_NORMAL);
    cv::imshow("Parallel Output", imres * 65535.0);
    imres.release();
#endif

    cudaFree(harris);
    cudaFree(img);

    free(data);
    free(res);
    return 0;
}

这是显示的错误:

CUDA ERROR : an illegal memory access was encountered

**CUDA ERROR : unspecified launch failure========= Invalid __global__ read of size 4
=========     at 0x000002d0 in C:/Users/Jiayih/source/repos/cuda/cuda/main.cu:383:cudafilter2sq(float[258]*, float[258]*, int, int)
=========     by thread (15,1,0) in block (0,0,1)
=========     Address 0x2c6f5fee774 is out of bounds
=========     Device Frame:C:/Users/Jiayih/source/repos/cuda/cuda/main.cu:453:cudapipeline_harris(int, int, float*, float*) (cudapipeline_harris(int, int, float*, float*) : 0x2130)
=========     Saved host backtrace up to driver entry point at kernel launch time
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x81dcd]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x82167]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x8686e]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuProfilerStop + 0x11473a) [0x3322ba]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x176ea9]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuProfilerStop + 0xe97c2) [0x307342]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x361bd]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x365e1]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll [0x368c4]
=========     Host Frame:C:\WINDOWS\system32\DriverStore\FileRepository\nvamui.inf_amd64_544551ddc9ce8575\nvcuda64.dll (cuLaunchKernel + 0x234) [0x20d954]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll [0x8dba]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll [0x8c66]
=========     Host Frame:C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin\cudart64_110.dll (cudaLaunchKernel + 0x1c4) [0x29024]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (main + 0x1f) [0x516f]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__device_stub__Z19cudapipeline_harrisiiPfS_ + 0x22e) [0x4fbe]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (cudapipeline_harris + 0x41) [0x44c1]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (main + 0x577) [0x4a47]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (invoke_main + 0x39) [0xfa79]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__scrt_common_main_seh + 0x12e) [0xf95e]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (__scrt_common_main + 0xe) [0xf81e]
=========     Host Frame:C:\Users\Jiayih\source\repos\cuda\x64\Debug\cuda.exe (mainCRTStartup + 0x9) [0xfb09]
=========     Host Frame:C:\WINDOWS\System32\KERNEL32.DLL (BaseThreadInitThunk + 0x14) [0x17bd4]
=========     Host Frame:C:\WINDOWS\SYSTEM32\ntdll.dll (RtlUserThreadStart + 0x21) [0x6ce51]
=========**

这里的调试过程相当简单。您的 CUDA 错误输出指向 cudafilter2sq 中的 out-of-range 访问错误,如下所示:

Invalid global read of size 4 ========= at ...cuda/main.cu:383:cudafilter2sq
... Address ... is out of bounds

查看 cudafilter2sq,问自己一个问题“其中一个访问怎么会超出范围?”由于该函数相当简单,答案是,“如果其中一个索引(从 ij 计算)超出了 A/IxB/Iy。然后你只需根据已知的可能范围(0-33、0-257)测试这些计算出的索引。

很明显,cudafilter2sq 需要一个大于 0 的 i 值,否则 i-1 将索引超出范围。但是你不满足这个要求。添加:

 #include <assert.h> 

然后添加:

 assert(i > 0); 

cudafilter2sq 的开头。然后 运行 您的代码启用了内存检查功能(就像您已经在做的那样)。您将命中这些设备断言,表明您正在索引 out-of-range。 j.

你也有同样的问题

当我在cudafilter2sq开头添加如下代码时:

if (i < 1) i = 1;  if (j < 1) j = 1;

你的代码 运行 对我来说没有错误。很明显,如果您的 cudapipeline_harris 内核 for-loop 以:

开头
 ...int i = bot0;...

然后:

 int newI = i - bot0; 

可以为 newI 生成零值(对于 newJ 也是如此)。所以这似乎是索引问题的“根源”。我想你可以从这里修复它。

此外,请注意您对 cudafilter2sq:

的前向声明
__device__ float cudafilter2sq(float A[16][34], float B[34][258], int i, int j);

不符合定义

__device__ float cudafilter2sq(float A[34][258], float B[34][258], int i, int j)