OpenCL 二维数组实现

OpenCL 2D Array Implementation

我目前正在用 OpenCL 实现二维数组。当 matrix_size 不超过 15 左右时,一切运行正常;但当我把它增大到大约 100 时,程序就崩溃了。根据 Visual Studio 调试器,问题似乎是整数除以 0,但我不确定它发生在哪里。我的猜测是:这是工作项和工作组的设置问题:

queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(matrix_size*matrix_size), cl::NullRange);

不幸的是,我不太确定如何解决这个问题。最终,我希望能够在相对较大的数据集上运行这些基本计算。

int main() {
`   srand((unsigned int)time(NULL));
    const int matrix_size = 100;
    string input;
    string func;
    string input_file;
    cout << "Please enter a arithmetic option: multi or add" << endl;
    cout << ">> ";
    input_file = "MatrixArithmetic.cl";
    getline(cin, input);
    if (input[0] == 'a') {func = "matrix_add";}
    else if (input[0] == 'm') {func = "matrix_multi";}
    else { cout << "Not a valid option... exiting" << endl; return 0; }
    ifstream ArithmeticFile(input_file);
    string src(istreambuf_iterator<char>(ArithmeticFile), (istreambuf_iterator<char>()));


    //prepare platform
    vector<cl::Platform> platforms;
    cl::Platform::get(&platforms);
    auto platform = platforms.front();
    //gather device info from platform and store into devices vector
    vector<cl::Device> devices;
    platform.getDevices(CL_DEVICE_TYPE_GPU, &devices);
    //chose device for computation
    auto device = devices.front();
    cout << "Using device: " << device.getInfo<CL_DEVICE_NAME>() << endl;
    /*cout << "This: " << CL_DEVICE_MAX_MEM_ALLOC_SIZE << endl;
    cout << "This 2: " << CL_DEVICE_MAX_MEM_ALLOC_SIZE << endl;*/
    //setup the context

    cl::Program::Sources sources;
    sources.push_back({ src.c_str(), src.length() });
    cl::Context context(device);
    cl::Program program(context, sources);
    auto err = program.build("-cl-std=CL1.2");
    //setup kernel (this is kernel specific)
    cl::Buffer buffer_A(context, CL_MEM_READ_WRITE, sizeof(float)*matrix_size*matrix_size);
    cl::Buffer buffer_B(context, CL_MEM_READ_WRITE, sizeof(float)*matrix_size*matrix_size);
    cl::Buffer buffer_C(context, CL_MEM_READ_WRITE, sizeof(float)*matrix_size*matrix_size);
    //build and seed matrix using random for computation this is done on the main processor
    float vec1[matrix_size][matrix_size];
    float vec2[matrix_size][matrix_size];
    for (int x = 0; x < matrix_size; x++) {
        for (int y = 0; y < matrix_size; y++) {
            vec1[x][y] = (float)(1+rand()%(rand()%1000));
            vec2[x][y] = (float)(1+rand()%(rand()%1000));
        }
    }
    //queue setup for pushing commands to device
    cl::CommandQueue queue(context, device);
    //write vec1 and vec2 to device
    queue.enqueueWriteBuffer(buffer_A, CL_TRUE, 0, sizeof(float)*matrix_size*matrix_size, vec1);
    queue.enqueueWriteBuffer(buffer_B, CL_TRUE, 0, sizeof(float)*matrix_size*matrix_size, vec2);
    ////run the kernel
    cl::Kernel kernel = cl::Kernel(program, func.c_str());
    //pushing argument to kernel it has 3 total arguments
    kernel.setArg(0, buffer_A);
    kernel.setArg(1, buffer_B);
    kernel.setArg(2, buffer_C);
    queue.enqueueNDRangeKernel(kernel, cl::NullRange, cl::NDRange(matrix_size*matrix_size), cl::NullRange);
    queue.finish();
    //writing to the buffer 
    float vec3[matrix_size][matrix_size];
    queue.enqueueReadBuffer(buffer_C, CL_TRUE, 0, sizeof(float)*matrix_size*matrix_size, vec3);
    cin.get();
    return 0;
}


Kernel:

    // Element-wise addition of two flattened matrices: C[i] = A[i] + B[i].
    // Each work-item handles the single element selected by its global id in
    // dimension 0. There is no bounds check, so the global work size must not
    // exceed the number of elements in A, B and C.
    //   A, B: read-only input buffers
    //   C:    output buffer
    __kernel void matrix_add(__global const float *A, __global const float *B, __global float *C)
    {
        // get_global_id returns size_t; keep that type so the index is not narrowed
        const size_t idx = get_global_id(0);
        C[idx] = A[idx] + B[idx];
    }

    // Element-wise product of two flattened matrices: C[i] = A[i] * B[i].
    // OpenCL buffers are one-dimensional, so the 2D matrices are passed
    // flattened and each work-item handles the element at its global id.
    //   A, B: read-only input buffers
    //   C:    output buffer
    __kernel void matrix_multi(__global const float *A,
                               __global const float *B,
                               __global float *C)
    {
        // position of this work-item's element in the flattened matrices
        int idx = get_global_id(0);
        const float lhs = A[idx];
        const float rhs = B[idx];
        C[idx] = lhs * rhs;
    }

规格:i5 4690K 和 AMD r9 290 8GB 内存。所以内存应该不是问题,每个 "Matrix" 在 100x100 的大小下应该占用大约 4000 个字节

罪魁祸首:rand()%1000 有时求值为 0,导致外层的取模运算除以零。

vec1[x][y] = (float)(1+rand()%(rand()%1000));
vec2[x][y] = (float)(1+rand()%(rand()%1000));

更改为:

vec1[x][y] = (float)(1+rand()%(1+rand()%1000));
vec2[x][y] = (float)(1+rand()%(1+rand()%1000));

您应该使用包装器将一维数组(或分配)视为二维数组:

MDArr buf = new MDArr(100,100); // allocates 10k memory and gets its pointer
buf[10][90]=3; // this is overloaded operator usage for host-side
clEnqueueWriteBuffer(.,..,buf.ptr()) // 0th address of contiguous mem
                                    // same as &vec1[0][0]

所以它不会访问非连续的禁区。

编辑:在 x86 CPU 上,100x100 的浮点数组需要 4*100*100 = 40,000 字节(约 40 KB)的内存。