clEnqueueNDRangeKernel 在什么情况下、为什么会返回 NULL 事件?

When or why does clEnqueueNDRangeKernel return Null as event?

您好,我正在尝试运行《Seven Concurrency Models in Seven Weeks》(《七周七并发模型》)一书中的示例代码。作者使用的是 MacBook,而我使用的是装有 Windows 10 的 Dell XPS。

我的程序崩溃了,因为在调用函数 clEnqueueNDRangeKernel() 之后,timing_event 仍然是 null。

// Excerpt from main(): enqueue the kernel over NUM_ELEMENTS work-items and
// request an event for it through the last argument.
cl_event timing_event;  // declared without an initializer
size_t work_units = NUM_ELEMENTS;
// The call's return value is not stored, so this excerpt never inspects the
// status code of the enqueue.
clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units,
    NULL, 0, NULL,&timing_event);

官方文档对 event 参数的说明如下:

event

Returns an event object that identifies this particular kernel execution instance. Event objects are unique and can be used to identify a particular kernel execution instance later on. If event is NULL, no event will be created for this kernel execution instance and therefore it will not be possible for the application to query or queue a wait for this particular kernel execution instance.

谁能解释一下为什么我的戴尔会出现这种情况,而作者的 macbook 却没有?

我找到了解决方案。问题并非出在 clEnqueueNDRangeKernel(),而是出在更早的 clBuildProgram(program, 0, NULL, NULL, NULL, NULL); 调用上。我用 clGetProgramBuildInfo() 获取了构建日志,发现问题在于我的 multiply_arrays.cl 文件不是 UTF-8 编码。


给所有刚接触 OpenCL 的人:每个 OpenCL 函数都会返回一个对应特定错误代码的状态整数。如果函数的返回值本身不是状态码,则可以向该函数传入一个指针来接收状态码。请参阅我在下面链接的示例函数。这对调试程序很有帮助。

Returns Status Code

Status Code by Reference

main.cpp

/***
* Excerpted from "Seven Concurrency Models in Seven Weeks",
* published by The Pragmatic Bookshelf.
* Copyrights apply to this code. It may not be used to create training material,
* courses, books, articles, and the like. Contact us if you are in doubt.
* We make no guarantees that this code is fit for any purpose.
* Visit http://www.pragmaticprogrammer.com/titles/pb7con for more book information.
***/
#ifdef __APPLE__
#include <OpenCL/cl.h>
#include <mach/mach_time.h>
#else  
#include <CL/cl.h>
#include <Windows.h>

#endif

#include <inttypes.h>
#include <stdio.h>
#include <stdlib.h>

#include <chrono>
#include<iostream>
#include <vector>

#define NUM_ELEMENTS (100000)

/**
 * Reads an entire file into a newly allocated, NUL-terminated buffer.
 *
 * @param filename Path of the file to load.
 * @return A malloc()-allocated buffer holding the file contents followed by a
 *         terminating '\0', or NULL if the file cannot be opened or sized or
 *         the buffer cannot be allocated. The caller owns the buffer and must
 *         release it with free().
 */
char* read_source(const char* filename) {
    // Binary mode keeps the size reported by ftell() consistent with the
    // number of bytes fread() delivers (text mode may translate line endings).
    FILE* h = fopen(filename, "rb");
    if (h == NULL)
        return NULL;

    char* program = NULL;
    if (fseek(h, 0, SEEK_END) == 0) {
        const long s = ftell(h);
        if (s >= 0) {
            rewind(h);
            program = static_cast<char*>(malloc(static_cast<size_t>(s) + 1));
            if (program != NULL) {
                // Terminate at the number of bytes actually read so a short
                // read never leaves uninitialized bytes inside the string.
                const size_t got =
                    fread(program, sizeof(char), static_cast<size_t>(s), h);
                program[got] = '\0';
            }
        }
    }
    fclose(h);
    return program;
}

/**
 * Fills an array with pseudo-random values produced by rand().
 *
 * Each element is set to rand() divided by RAND_MAX, i.e. a value in [0, 1].
 *
 * @param array Destination array; must hold at least `size` elements.
 * @param size  Number of elements to write.
 */
void random_fill(cl_float array[], size_t size) {
    // The index has the same type as `size`, so the comparison is not a
    // signed/unsigned mix and cannot overflow for large arrays.
    for (size_t i = 0; i < size; ++i)
        array[i] = (cl_float)rand() / RAND_MAX;
}

int main() {
    //Status for Errorhandling
    cl_int status;

    //Identify Platform
    cl_platform_id platform;
    clGetPlatformIDs(1, &platform, NULL);

    //Get Id of GPU
    cl_device_id device;
    cl_uint num_devices = 0;
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices);

    // Create Context
    cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);


    //Use context to create Command Queue
    //Que enables us to send commands to the gpu device
    cl_command_queue queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL);

    //Load Kernel
    char* source = read_source("multiply_arrays.cl");
    cl_program program = clCreateProgramWithSource(context, 1,
        (const char**)&source, NULL, &status);
    free(source);

    // Build Program
    status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL);

    size_t len;
    char *buffer;
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len);
    buffer = (char *) malloc(len);
    clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, len, buffer, NULL);
    printf("%s\n", buffer);

    //Create Kernel
    cl_kernel kernel = clCreateKernel(program, "multiply_arrays", &status);

    // Create Arrays with random Numbers
    cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS];
    random_fill(a, NUM_ELEMENTS);
    random_fill(b, NUM_ELEMENTS);

    //uint64_t startGPU = mach_absolute_time();
    auto start = std::chrono::high_resolution_clock::now();


    //Create Readonly input Buffers with value from a and b
    cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(cl_float) * NUM_ELEMENTS, a, NULL);
    cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
        sizeof(cl_float) * NUM_ELEMENTS, b, NULL);

    //Create Output buffer write Only
    cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
        sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL);

    //set Kernel Arguments
    clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB);
    clSetKernelArg(kernel, 2, sizeof(cl_mem), &output);

    cl_event timing_event;
    size_t work_units = NUM_ELEMENTS;
    status = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units,
        NULL, 0, NULL,&timing_event);

    cl_float results[NUM_ELEMENTS];
    //Calculate Results and copy from output buffer to results
    clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS,
        results, 0, NULL, NULL);

    //uint64_t endGPU = mach_absolute_time();
    auto finish = std::chrono::high_resolution_clock::now();
    //printf("Total (GPU): %lu ns\n\n", (unsigned long)(endGPU - startGPU));
    std::cout << "Total(GPU) :"<< std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << "ns\n";

    cl_ulong starttime;
    clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START,
        sizeof(cl_ulong), &starttime, NULL);
    cl_ulong endtime;
    clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END,
        sizeof(cl_ulong), &endtime, NULL);
    printf("Elapsed (GPU): %lu ns\n\n", (unsigned long)(endtime - starttime));
    clReleaseEvent(timing_event);
    clReleaseMemObject(inputA);
    clReleaseMemObject(inputB);
    clReleaseMemObject(output);
    clReleaseKernel(kernel);
    clReleaseProgram(program);
    clReleaseCommandQueue(queue);
    clReleaseContext(context);

    //uint64_t startCPU = mach_absolute_time();
    start = std::chrono::high_resolution_clock::now();
    for (int i = 0; i < NUM_ELEMENTS; ++i)
        results[i] = a[i] * b[i];

    //uint64_t endCPU = mach_absolute_time();
    finish = std::chrono::high_resolution_clock::now();
    //printf("Elapsed (CPU): %lu ns\n\n", (unsigned long)(endCPU - startCPU));
    std::cout << "Elapsed (CPU) :" << std::chrono::duration_cast<std::chrono::nanoseconds>(finish - start).count() << "ns\n";
    return 0;
}

multiply_arrays.cl

// Element-wise product of two float arrays.
// Each work-item takes its global id in dimension 0 as an index, reads that
// element from inputA and inputB, and stores their product in output.
__kernel void multiply_arrays(__global const float* inputA,
                              __global const float* inputB,
                              __global float* output)
{
    const int gid = get_global_id(0);
    const float lhs = inputA[gid];
    const float rhs = inputB[gid];
    output[gid] = lhs * rhs;
}
//ö