OpenCL C 程序给出奇怪的输出
OpenCL C program gives bizarre output
我写了一个简单的 OpenCL C 代码。它的内核代码是:
/* Each work-item writes its own global index (dimension 0) into B and
 * increments the element of A at that same index by one. */
__kernel void hello(__global int *A, __global int *B)
{
    const int gid = get_global_id(0);

    B[gid] = gid;
    A[gid] = A[gid] + 1;
}
以下是主机代码的一部分:
/*
 * Host driver: enumerates the OpenCL platforms and, for each device returned
 * for CL_DEVICE_TYPE_DEFAULT on a platform, creates a context and a
 * profiling-enabled command queue, builds the kernel source read from
 * "./6.cl", runs the kernel named "hello" and prints the measured kernel
 * time followed by the contents of B and A.
 *
 * Returns 0. Calls exit(1) if the kernel source file cannot be opened.
 * Most OpenCL return codes are only used to print a success message;
 * failures are not acted upon.
 */
int main()
{
cl_platform_id* platforms=NULL;
cl_device_id* devices=NULL;
/* NOTE(review): ret is declared as cl_uint but is later printed with %d. */
cl_uint ret,platformCount,deviceCount;
cl_context context = NULL;
cl_command_queue command_queue=NULL;
/* name and l are declared but not used in the visible code. */
char* name;
int i,j,l;
size_t size;
cl_mem memobj = NULL;
cl_mem memobj1 = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
int array_size=10;
/* Host-side arrays; only A is zero-filled below, B is left uninitialized. */
int *A=(int*)malloc(array_size*sizeof(int));
int *B=(int*)malloc(array_size*sizeof(int));
printf("\nhey");
for(i=0;i<array_size;i++)
A[i]=0;
/* First call only asks for the platform count; the second fills the array. */
ret=clGetPlatformIDs(0,NULL,&platformCount);
printf("\n-----------------Found %d platforms-----------\n",platformCount);
platforms=(cl_platform_id*)malloc(sizeof(cl_platform_id)*platformCount);
ret=clGetPlatformIDs(platformCount,platforms,NULL);
if(ret==CL_SUCCESS)
printf("\nPlatform ids obtained successfully!");
/* Walk the platforms from the highest index down to 0. */
for(i=platformCount-1;i>=0;i--)
{
char * platformname;
printf("\n-------------In platform %d-----------------",i);
/* Query the length of the platform name, then fetch the name itself. */
ret=clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME ,0,NULL,&size);
platformname=(char*)malloc(sizeof(char)*size);
ret=clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME ,size,platformname,NULL);
if(ret==CL_SUCCESS)
printf("\nPlatform info obtained successfully!");
printf("\n----------------For %s-----------------------",platformname);
/* NOTE(review): CL_DEVICE_TYPE_DEFAULT asks only for the platform's default
 * device type, so fewer devices than are installed may be reported;
 * CL_DEVICE_TYPE_ALL would enumerate every device. NULL is also passed for
 * the integer num_entries argument of the counting call. */
ret=clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_DEFAULT,NULL,NULL,&deviceCount);
devices=(cl_device_id*)malloc(sizeof(cl_device_id)*deviceCount);
ret=clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_DEFAULT,deviceCount,devices,NULL);
if(ret==CL_SUCCESS)
printf("\nFound %d devices!",deviceCount);
/* Everything below is repeated independently for each returned device. */
for(j=0;j<deviceCount;j++)
{
char*devicename;
printf("\n-------------Device %d.%d-----------------",i,j+1);
/* Query the length of the device name, then fetch the name itself. */
ret=clGetDeviceInfo(devices[j], CL_DEVICE_NAME ,0,NULL,&size);
devicename=(char*)malloc(sizeof(char)*size);
ret=clGetDeviceInfo(devices[j], CL_DEVICE_NAME ,size,devicename,NULL);
if(ret==CL_SUCCESS)
printf("\nDevice info obtained successfully!");
/* NOTE(review): size is a size_t but is printed with %d. */
printf("\nThe device name is %s && size=%d\n",devicename,size);
printf("\nFound %d corresponding devices",deviceCount);
/* A single-device context and a profiling-enabled queue for devices[j]. */
context=clCreateContext(NULL,1,&devices[j], NULL, NULL, &ret);
if(ret==CL_SUCCESS)
printf("\nContext created successfully");
command_queue=clCreateCommandQueue(context,devices[j], CL_QUEUE_PROFILING_ENABLE,&ret);
if(ret==CL_SUCCESS)
printf("\nCommand queue created successfully");
/* Two read/write device buffers of array_size ints: memobj for A, memobj1 for B. */
memobj=clCreateBuffer(context, CL_MEM_READ_WRITE ,array_size*sizeof(int),NULL,&ret);
if(ret==CL_SUCCESS)
printf("\nMemory object 1 created successfully");
memobj1=clCreateBuffer(context, CL_MEM_READ_WRITE ,array_size*sizeof(int),NULL,&ret);
if(ret==CL_SUCCESS)
printf("\nMemory object 2 created successfully");
/* Blocking (CL_TRUE) uploads of A and B. B has not been initialized on the
 * host at this point, so the second upload copies indeterminate values. */
ret=clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, array_size*sizeof(int), A, 0, NULL, NULL);
if(ret==CL_SUCCESS)
printf("\nData written into buffer1 successfully");
ret=clEnqueueWriteBuffer(command_queue, memobj1, CL_TRUE, 0, array_size*sizeof(int),B, 0, NULL, NULL);
if(ret==CL_SUCCESS)
printf("\nData written into buffer2 successfully");
FILE *fp;
char fileName[] = "./6.cl";
char *source_str;
size_t source_size;
/* Load kernel code */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
/* MAX_SOURCE_SIZE is not defined in the visible code - presumably a macro
 * defined elsewhere in the file. */
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
/* NOTE(review): fread does not NUL-terminate source_str, yet it is printed
 * with %s here. */
printf("\nThe program is \n%s\n",source_str);
fclose(fp);
/* Create the program from the loaded text and build it for devices[j] only. */
program=clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);
if(ret==CL_SUCCESS)
printf("\nProgram created successfully");
ret = clBuildProgram(program, 1, &devices[j], NULL, NULL, NULL);
if(ret==CL_SUCCESS)
printf("\nProgram built successfully");
/* Query the size of the build status, then the status value itself. */
ret=clGetProgramBuildInfo(program,devices[j], CL_PROGRAM_BUILD_STATUS ,0,NULL,&size);
printf("\n Program buildinfo status=%d",ret);
/* NOTE(review): size is the byte size reported by the query above, but it is
 * used as an element count here, so more memory than needed is allocated. */
cl_build_status *status=(cl_build_status *)malloc(sizeof(cl_build_status )*size);
clGetProgramBuildInfo(program,devices[j], CL_PROGRAM_BUILD_STATUS ,size,status,NULL);
printf("\nBuild status=%d\n",*status);
printf("\nBuild log i=%d, j=%d",i,j);
/* The log size queried here is not used afterwards; the log is fetched into
 * a fixed 2048-byte stack buffer instead. */
ret=clGetProgramBuildInfo(program,devices[j], CL_PROGRAM_BUILD_LOG ,0,NULL,&size);
printf("\nclGetProgramBuildInfo ret1=%d",ret);
char buildlog[2048];
ret=clGetProgramBuildInfo(program,devices[j], CL_PROGRAM_BUILD_LOG ,sizeof(buildlog),buildlog,NULL);
printf("\nclGetProgramBuildInfo ret2=%d",ret);
printf("\n!!!!!!!!!!!!!!!!!!!!!Program ended!!!!!!!!!!!\n");
printf("\n\nBuildlog: %s\n\n",buildlog);
/* Create the "hello" kernel and bind memobj (A) and memobj1 (B) as its arguments. */
kernel = clCreateKernel(program, "hello", &ret);
if(ret==CL_SUCCESS)
printf("\nKernel created successfully");
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &memobj);
printf("\nKernel argument 1=%d",ret);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &memobj1);
printf("\nKernel argument 2=%d",ret);
cl_uint work_dim = 1;
/* NOTE(review): the local size (32) does not evenly divide the global size
 * (array_size = 10). The enqueue below is therefore expected to fail with
 * CL_INVALID_WORK_GROUP_SIZE and enqueue no work, which would leave A and B
 * on the device untouched. */
size_t global_item_size=array_size;
size_t local_item_size=32;
cl_event event;
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,&global_item_size, &local_item_size,0,NULL,&event);
/* Only success is reported; a failing return code is silently ignored. */
if(ret==CL_SUCCESS)
printf("\nKernel executed successfully");
//ret=clEnqueueTask(command_queue,kernel,0,NULL,NULL);
/* NOTE(review): event has no initializer; if the enqueue above failed it is
 * presumably never set, yet it is still waited on and queried below. */
clWaitForEvents(1, &event);//make sure kernel has finished
clFinish(command_queue);//make sure all enqueued tasks finished
//get the profiling data and calculate the kernel execution time
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
/* Difference of the two timestamps divided by 1e6, printed as milliseconds. */
total_time = (cl_double)(time_end - time_start)/1000000.0;
printf("OpenCl Execution time is: %10.5f[ms] \n",total_time);
/* Blocking reads copy the device buffers back into B and A; ret is
 * overwritten without being checked. */
ret = clEnqueueReadBuffer(command_queue, memobj1, CL_TRUE, 0,array_size * sizeof(int), B, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0,array_size * sizeof(int), A, 0, NULL, NULL);
int t;
printf("\nThe result is:");
for(t=0;t<array_size;t++)
printf("\t%d",B[t]);
printf("\nThe result A is:");
for(t=0;t<array_size;t++)
printf("\t%d",A[t]);
//Read file here
/* NOTE(review): none of the malloc'd buffers or OpenCL objects created in
 * this iteration are freed or released in the visible code. */
}
}
return 0;
}
我面临几个问题:
- 程序输出的数组 B 是垃圾值,并且返回的数组 A 未被修改。
- 如果不是
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,&global_item_size, &local_item_size,1,NULL,&event);
我使用ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,&global_item_size, &local_item_size,0,NULL,NULL);
,程序输出正确的结果。我在 2 个 GPU 和一个 CPU 上运行这段代码。如果我使用后一种形式的 clEnqueueNDRangeKernel(即不带性能分析事件),我在 GPU 上得到正确的输出,而在 CPU 上得到的是垃圾值。
- 该代码针对特定平台列出的设备少于实际可用的设备。
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,&global_item_size, &local_item_size,1,NULL,&event);
您正在传递一个空的 (NULL) 事件等待列表,但声称其中有 1 个事件。这将失败;如果您检查 OpenCL API 调用的错误代码(您应该始终这样做),您可能会得到 CL_INVALID_EVENT_WAIT_LIST,这会指出问题所在。
如果您只想从内核中取回事件(例如用于分析),但不想将任何事件依赖项传递给它,那么正确的形式是这样的:
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,&global_item_size, &local_item_size,0,NULL,&event);
您提供的代码的第二个问题是您的工作组大小并没有完全除以全局大小:
size_t global_item_size=array_size; // which is 10
size_t local_item_size=32;
这将导致 clEnqueueNDRangeKernel 返回 CL_INVALID_WORK_GROUP_SIZE,并且不会有任何工作入队。
The code lists fewer devices for a particular platform than those which are actually available.
您正在请求类型为 CL_DEVICE_TYPE_DEFAULT 的所有设备。如果您真的想要所有设备,请使用 CL_DEVICE_TYPE_ALL。如果您只想要 GPU 设备,请请求 CL_DEVICE_TYPE_GPU。
我写了一个简单的 OpenCL C 代码。它的内核代码是:
/* Each work-item writes its own global index (dimension 0) into B and
 * increments the element of A at that same index by one. */
__kernel void hello(__global int *A, __global int *B)
{
    const int gid = get_global_id(0);

    B[gid] = gid;
    A[gid] = A[gid] + 1;
}
以下是主机代码的一部分:
/*
 * Host driver: enumerates the OpenCL platforms and, for each device returned
 * for CL_DEVICE_TYPE_DEFAULT on a platform, creates a context and a
 * profiling-enabled command queue, builds the kernel source read from
 * "./6.cl", runs the kernel named "hello" and prints the measured kernel
 * time followed by the contents of B and A.
 *
 * Returns 0. Calls exit(1) if the kernel source file cannot be opened.
 * Most OpenCL return codes are only used to print a success message;
 * failures are not acted upon.
 */
int main()
{
cl_platform_id* platforms=NULL;
cl_device_id* devices=NULL;
/* NOTE(review): ret is declared as cl_uint but is later printed with %d. */
cl_uint ret,platformCount,deviceCount;
cl_context context = NULL;
cl_command_queue command_queue=NULL;
/* name and l are declared but not used in the visible code. */
char* name;
int i,j,l;
size_t size;
cl_mem memobj = NULL;
cl_mem memobj1 = NULL;
cl_program program = NULL;
cl_kernel kernel = NULL;
int array_size=10;
/* Host-side arrays; only A is zero-filled below, B is left uninitialized. */
int *A=(int*)malloc(array_size*sizeof(int));
int *B=(int*)malloc(array_size*sizeof(int));
printf("\nhey");
for(i=0;i<array_size;i++)
A[i]=0;
/* First call only asks for the platform count; the second fills the array. */
ret=clGetPlatformIDs(0,NULL,&platformCount);
printf("\n-----------------Found %d platforms-----------\n",platformCount);
platforms=(cl_platform_id*)malloc(sizeof(cl_platform_id)*platformCount);
ret=clGetPlatformIDs(platformCount,platforms,NULL);
if(ret==CL_SUCCESS)
printf("\nPlatform ids obtained successfully!");
/* Walk the platforms from the highest index down to 0. */
for(i=platformCount-1;i>=0;i--)
{
char * platformname;
printf("\n-------------In platform %d-----------------",i);
/* Query the length of the platform name, then fetch the name itself. */
ret=clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME ,0,NULL,&size);
platformname=(char*)malloc(sizeof(char)*size);
ret=clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME ,size,platformname,NULL);
if(ret==CL_SUCCESS)
printf("\nPlatform info obtained successfully!");
printf("\n----------------For %s-----------------------",platformname);
/* NOTE(review): CL_DEVICE_TYPE_DEFAULT asks only for the platform's default
 * device type, so fewer devices than are installed may be reported;
 * CL_DEVICE_TYPE_ALL would enumerate every device. NULL is also passed for
 * the integer num_entries argument of the counting call. */
ret=clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_DEFAULT,NULL,NULL,&deviceCount);
devices=(cl_device_id*)malloc(sizeof(cl_device_id)*deviceCount);
ret=clGetDeviceIDs(platforms[i],CL_DEVICE_TYPE_DEFAULT,deviceCount,devices,NULL);
if(ret==CL_SUCCESS)
printf("\nFound %d devices!",deviceCount);
/* Everything below is repeated independently for each returned device. */
for(j=0;j<deviceCount;j++)
{
char*devicename;
printf("\n-------------Device %d.%d-----------------",i,j+1);
/* Query the length of the device name, then fetch the name itself. */
ret=clGetDeviceInfo(devices[j], CL_DEVICE_NAME ,0,NULL,&size);
devicename=(char*)malloc(sizeof(char)*size);
ret=clGetDeviceInfo(devices[j], CL_DEVICE_NAME ,size,devicename,NULL);
if(ret==CL_SUCCESS)
printf("\nDevice info obtained successfully!");
/* NOTE(review): size is a size_t but is printed with %d. */
printf("\nThe device name is %s && size=%d\n",devicename,size);
printf("\nFound %d corresponding devices",deviceCount);
/* A single-device context and a profiling-enabled queue for devices[j]. */
context=clCreateContext(NULL,1,&devices[j], NULL, NULL, &ret);
if(ret==CL_SUCCESS)
printf("\nContext created successfully");
command_queue=clCreateCommandQueue(context,devices[j], CL_QUEUE_PROFILING_ENABLE,&ret);
if(ret==CL_SUCCESS)
printf("\nCommand queue created successfully");
/* Two read/write device buffers of array_size ints: memobj for A, memobj1 for B. */
memobj=clCreateBuffer(context, CL_MEM_READ_WRITE ,array_size*sizeof(int),NULL,&ret);
if(ret==CL_SUCCESS)
printf("\nMemory object 1 created successfully");
memobj1=clCreateBuffer(context, CL_MEM_READ_WRITE ,array_size*sizeof(int),NULL,&ret);
if(ret==CL_SUCCESS)
printf("\nMemory object 2 created successfully");
/* Blocking (CL_TRUE) uploads of A and B. B has not been initialized on the
 * host at this point, so the second upload copies indeterminate values. */
ret=clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, array_size*sizeof(int), A, 0, NULL, NULL);
if(ret==CL_SUCCESS)
printf("\nData written into buffer1 successfully");
ret=clEnqueueWriteBuffer(command_queue, memobj1, CL_TRUE, 0, array_size*sizeof(int),B, 0, NULL, NULL);
if(ret==CL_SUCCESS)
printf("\nData written into buffer2 successfully");
FILE *fp;
char fileName[] = "./6.cl";
char *source_str;
size_t source_size;
/* Load kernel code */
fp = fopen(fileName, "r");
if (!fp) {
fprintf(stderr, "Failed to load kernel.\n");
exit(1);
}
/* MAX_SOURCE_SIZE is not defined in the visible code - presumably a macro
 * defined elsewhere in the file. */
source_str = (char*)malloc(MAX_SOURCE_SIZE);
source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
/* NOTE(review): fread does not NUL-terminate source_str, yet it is printed
 * with %s here. */
printf("\nThe program is \n%s\n",source_str);
fclose(fp);
/* Create the program from the loaded text and build it for devices[j] only. */
program=clCreateProgramWithSource(context, 1, (const char **)&source_str,(const size_t *)&source_size, &ret);
if(ret==CL_SUCCESS)
printf("\nProgram created successfully");
ret = clBuildProgram(program, 1, &devices[j], NULL, NULL, NULL);
if(ret==CL_SUCCESS)
printf("\nProgram built successfully");
/* Query the size of the build status, then the status value itself. */
ret=clGetProgramBuildInfo(program,devices[j], CL_PROGRAM_BUILD_STATUS ,0,NULL,&size);
printf("\n Program buildinfo status=%d",ret);
/* NOTE(review): size is the byte size reported by the query above, but it is
 * used as an element count here, so more memory than needed is allocated. */
cl_build_status *status=(cl_build_status *)malloc(sizeof(cl_build_status )*size);
clGetProgramBuildInfo(program,devices[j], CL_PROGRAM_BUILD_STATUS ,size,status,NULL);
printf("\nBuild status=%d\n",*status);
printf("\nBuild log i=%d, j=%d",i,j);
/* The log size queried here is not used afterwards; the log is fetched into
 * a fixed 2048-byte stack buffer instead. */
ret=clGetProgramBuildInfo(program,devices[j], CL_PROGRAM_BUILD_LOG ,0,NULL,&size);
printf("\nclGetProgramBuildInfo ret1=%d",ret);
char buildlog[2048];
ret=clGetProgramBuildInfo(program,devices[j], CL_PROGRAM_BUILD_LOG ,sizeof(buildlog),buildlog,NULL);
printf("\nclGetProgramBuildInfo ret2=%d",ret);
printf("\n!!!!!!!!!!!!!!!!!!!!!Program ended!!!!!!!!!!!\n");
printf("\n\nBuildlog: %s\n\n",buildlog);
/* Create the "hello" kernel and bind memobj (A) and memobj1 (B) as its arguments. */
kernel = clCreateKernel(program, "hello", &ret);
if(ret==CL_SUCCESS)
printf("\nKernel created successfully");
ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &memobj);
printf("\nKernel argument 1=%d",ret);
ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &memobj1);
printf("\nKernel argument 2=%d",ret);
cl_uint work_dim = 1;
/* NOTE(review): the local size (32) does not evenly divide the global size
 * (array_size = 10). The enqueue below is therefore expected to fail with
 * CL_INVALID_WORK_GROUP_SIZE and enqueue no work, which would leave A and B
 * on the device untouched. */
size_t global_item_size=array_size;
size_t local_item_size=32;
cl_event event;
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,&global_item_size, &local_item_size,0,NULL,&event);
/* Only success is reported; a failing return code is silently ignored. */
if(ret==CL_SUCCESS)
printf("\nKernel executed successfully");
//ret=clEnqueueTask(command_queue,kernel,0,NULL,NULL);
/* NOTE(review): event has no initializer; if the enqueue above failed it is
 * presumably never set, yet it is still waited on and queried below. */
clWaitForEvents(1, &event);//make sure kernel has finished
clFinish(command_queue);//make sure all enqueued tasks finished
//get the profiling data and calculate the kernel execution time
cl_ulong time_start, time_end;
double total_time;
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL);
clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL);
/* Difference of the two timestamps divided by 1e6, printed as milliseconds. */
total_time = (cl_double)(time_end - time_start)/1000000.0;
printf("OpenCl Execution time is: %10.5f[ms] \n",total_time);
/* Blocking reads copy the device buffers back into B and A; ret is
 * overwritten without being checked. */
ret = clEnqueueReadBuffer(command_queue, memobj1, CL_TRUE, 0,array_size * sizeof(int), B, 0, NULL, NULL);
ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0,array_size * sizeof(int), A, 0, NULL, NULL);
int t;
printf("\nThe result is:");
for(t=0;t<array_size;t++)
printf("\t%d",B[t]);
printf("\nThe result A is:");
for(t=0;t<array_size;t++)
printf("\t%d",A[t]);
//Read file here
/* NOTE(review): none of the malloc'd buffers or OpenCL objects created in
 * this iteration are freed or released in the visible code. */
}
}
return 0;
}
我面临几个问题:
- 程序输出的数组 B 是垃圾值,并且返回的数组 A 未被修改。
- 如果不是
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,&global_item_size, &local_item_size,1,NULL,&event);
我使用ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,&global_item_size, &local_item_size,0,NULL,NULL);
,程序输出正确的结果。我在 2 个 GPU 和一个 CPU 上运行这段代码。如果我使用后一种形式的 clEnqueueNDRangeKernel(即不带性能分析事件),我在 GPU 上得到正确的输出,而在 CPU 上得到的是垃圾值。
- 该代码针对特定平台列出的设备少于实际可用的设备。
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,&global_item_size, &local_item_size,1,NULL,&event);
您正在传递一个空的 (NULL) 事件等待列表,但声称其中有 1 个事件。这将失败;如果您检查 OpenCL API 调用的错误代码(您应该始终这样做),您可能会得到 CL_INVALID_EVENT_WAIT_LIST,这会指出问题所在。
如果您只想从内核中取回事件(例如用于分析),但不想将任何事件依赖项传递给它,那么正确的形式是这样的:
ret = clEnqueueNDRangeKernel(command_queue, kernel, work_dim, NULL,&global_item_size, &local_item_size,0,NULL,&event);
您提供的代码的第二个问题是您的工作组大小并没有完全除以全局大小:
size_t global_item_size=array_size; // which is 10
size_t local_item_size=32;
这将导致 clEnqueueNDRangeKernel 返回 CL_INVALID_WORK_GROUP_SIZE,并且不会有任何工作入队。
The code lists fewer devices for a particular platform than those which are actually available.
您正在请求类型为 CL_DEVICE_TYPE_DEFAULT 的所有设备。如果您真的想要所有设备,请使用 CL_DEVICE_TYPE_ALL。如果您只想要 GPU 设备,请请求 CL_DEVICE_TYPE_GPU。