OpenCL入门程序

来源：互联网发布：网络银行的运作模式编辑：程序博客网时间：2024/05/18 02:04

注意：如果是从显存到显存的拷贝，应该使用 clEnqueueCopyBuffer，而不是 clEnqueueWriteBuffer。

以前就听说过OpenCL，今天特地试用了一下。我的机器是N卡，所以首先装上了CUDA的开发包；由于CUDA开发包对OpenCL的支持比较好，就选择在N卡的GPU上做并行计算。

OpenCL是一个开放的标准和规范，全称是开放计算语言（Open Computing Language），主要目的是发挥计算机的所有计算资源，包括CPU、GPU、多核处理器等。所以说OpenCL是一个跨硬件和软件平台的开放标准，在此框架下开发的并行计算程序很容易就能移植到其他平台上，也许是这样吧。其实，GPU并行计算的大致思路一般都是：CPU向GPU发送一个计算指令，然后把数据拷贝到GPU的显存中参与计算，最后将计算好的显存中的数据拷贝回主机内存。虽然过程大概就是这样，但是其中涉及到的细节特别多。下面就以一个简单的例子讲述OpenCL编程开发的一般步骤和模型。

第一步，首先获得可以参与计算的OpenCL平台个数

cl_uint numPlatforms = 0;   //GPU计算平台个数  cl_platform_id platform = NULL;  clGetPlatformIDs(0,NULL,&numPlatforms);

第二步，获得平台的列表，并选择其中的一个作为计算的平台

//获得平台列表  cl_platform_id * platforms = (cl_platform_id*)malloc(numPlatforms * sizeof(cl_platform_id));  clGetPlatformIDs (numPlatforms, platforms, NULL);  //轮询各个opencl设备  for (cl_uint i = 0; i < numPlatforms; i ++)  {    char pBuf[100];    clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,sizeof(pBuf),pBuf,NULL);    printf("%s\n",pBuf);    platform = platforms[i];  }  free(platforms);

第三步，获得硬件设备以及生成上下文

//获得GPU设备  cl_device_id device;  status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL);  //生成上下文  cl_context context = clCreateContext(0, 1, &device, NULL, NULL, &status);

至此，OpenCL的初始化工作已经完成，最好将这个过程封装成一个函数。

第四步，装载内核程序代码以及生成program

//装载内核程序  size_t szKernelLength = 0;  size_t sourceSize[] = {strlen(kernelSourceCode1)};  char *cFileName = "kernel.cl";  char * cPathAndName= shrFindFilePath(cFileName, argv[0]);  const char* kernelSourceCode = oclLoadProgSource(cPathAndName, "", &szKernelLength);  cl_program program = clCreateProgramWithSource(context,1,&kernelSourceCode,&szKernelLength,&status);  //为所有指定的设备生成CL_program  status = clBuildProgram(program,1,&device,NULL,NULL,NULL);    size_t len = 0;  char buf[2048];  if (status != CL_SUCCESS)  {    status = clGetProgramBuildInfo(program,device,CL_PROGRAM_BUILD_LOG,sizeof(buf),buf,&len);    printf("%s\n", buf);    exit(1);  }

第五步，创建一个命令队列，并将内核程序放入这个命令队列中执行

//创建一个opencl命令队列  cl_command_queue commandQueue = clCreateCommandQueue(context,device,0,&status);  //创建opencl buffer对象  cl_mem outputBuffer = clCreateBuffer(context,CL_MEM_ALLOC_HOST_PTR,4*4*4,NULL,&status);  //得到指定名字的内核实例句柄  cl_kernel kernel = clCreateKernel(program,"hellocl",&status);  //为内核程序设置相应的参数,也就是函数传参  status = clSetKernelArg(kernel,0,sizeof(cl_mem),&outputBuffer);  //将一个kernel放入队列  size_t globalThreads[] = {4,4};  size_t localThreads[] = {2,2};  //开始在设备上执行核函数  status = clEnqueueNDRangeKernel(commandQueue,kernel,2,NULL,    globalThreads,localThreads,0,NULL,NULL);  status = clFinish(commandQueue);

第六步，将计算结果拷贝到主存中

//将GPU本地内存中的数据拷回到host端的内存中  unsigned int *outbuffer = new unsigned int[4*4];  memset(outbuffer,0,4*4*4);  status = clEnqueueReadBuffer(commandQueue,outputBuffer,    CL_TRUE,0,4*4*4,outbuffer,0,NULL,NULL);

第七步，显示及清理内存

printf("out:\n");  for (int i = 0; i < 16; i ++)  {    printf("%x ",outbuffer[i]);    if ((i+1)%4 == 0)    {      printf("\n");    }  }  //清理部分  status = clReleaseKernel(kernel);  status = clReleaseProgram(program);  status = clReleaseMemObject(outputBuffer);  status = clReleaseCommandQueue(commandQueue);  status = clReleaseContext(context);  delete outbuffer;

核函数如下：

/*
 * hellocl - demo kernel: every work-item writes one value into `buffer`
 * at its linearised global index.
 *
 *   1-D range: buffer[x] = x
 *   2-D range: buffer[x + y*sizeX] = (1 << x) | (0x10 << y)
 *   3-D range: buffer[x + y*sizeX + z*sizeX*sizeY] = x
 *
 * buffer must hold at least one uint per work-item of the global range.
 */
__kernel void hellocl (__global uint *buffer)
{
  uint dim = get_work_dim();  /* number of dimensions of the NDRange */
  size_t gidx, gidy, gidz;    /* global id of this work-item per dimension */
  size_t gsizx, gsizy;        /* global size along x and y */

  if (dim == 1)
  {
    gidx = get_global_id(0);
    buffer[gidx] = gidx;
  }
  else if (dim == 2)
  {
    gidx = get_global_id(0);
    gidy = get_global_id(1);
    gsizx = get_global_size(0);
    /* Row-major index; the value encodes x in one bit and y in another. */
    buffer[gidx + gidy*gsizx] = (1<<gidx)|(0x10<<gidy);
  }
  else
  {
    gidx = get_global_id(0);
    gidy = get_global_id(1);
    /* Must be assigned to gidz: the index below reads gidz, which would
       otherwise be uninitialised, and gidy would hold the z index. */
    gidz = get_global_id(2);
    gsizx = get_global_size(0);
    gsizy = get_global_size(1);
    buffer[gidx + gidy*gsizx + gidz*gsizx*gsizy] = gidx;
  }
}

运算结果显示如下图：

这只是最简单的程序，复杂算法的并行化还需要深入研究。

0 0