AMD OpenCL例子阅读笔记系列之DeviceFission

来源：互联网发布：记忆曲线软件编辑：程序博客网时间：2024/05/18 12:01

1. 什么是DeviceFission?

DeviceFission就是设备拆分，目前仅支持CPU的拆分。在《OpenCL异构计算》中的第11章有部分内容。其中举得例子是AMD的6核Instanbul x86 CPU,该CPU有6个核，在默认时是作为一个设备看待的，但是通过设备拆分可以将6个核各自看做一个OpenCL设备。每一个设备都可以有一个或多个命令队列，这些命令队列时异步的并且各自在自己的线程种运行。这样可以用设备拆分来创建一个简单强大的基于任务并行的多线程应用程序。

2.例子的功能介绍

该例子的是将CPU设备分成两个，实现的函数为clCreateSubDevices,使用的特性为CL_DEVICE_PARTITION_BY_COUNTS特性。在该例子中只创建了一个缓冲区。由其中的一个子设备来写入初值，之后两个设备同时进行计算，一个子设备调用add内核，一个子设备调用sub内核。

3.内核实现函数

首先看下内核实现：

__kernelvoid Add(__global int* input, __global int* output){    size_t xPos = get_global_id(0);    output[xPos] = input[xPos] + 1;}__kernelvoid Sub(__global int* input, __global int* output){    size_t xPos = get_global_id(0);    output[xPos] = input[xPos] - 1;}

这两个内核都很简单，这里不再解释。

4.主机关键部分解析

intDeviceFission::setupDeviceFission(){    // Make sure length is multiple of group size * numSubDevices    unsigned int mulFactor = (unsigned int)groupSize * numSubDevices;    length = (length < mulFactor) ? mulFactor : length;    length = (length / mulFactor) * mulFactor;    // Calculate half length    half_length = length >> 1;    // Get allocate memory for input buffer    input = (cl_int*)malloc(half_length * sizeof(cl_int));    CHECK_ALLOCATION(input, "Failed to allocate host memory. (input)");    // Random initialisation of input    fillRandom<cl_int>(input, half_length, 1, 1, 8);    // Unless sampleArgs->quiet mode has been enabled, print the INPUT array    if(!sampleArgs->quiet)    {        printArray<cl_int>("Input:", input, half_length, 1);    }    // Get allocate memory for subOutput buffer    subOutput = (cl_int*)malloc(length * sizeof(cl_int));    CHECK_ALLOCATION(subOutput, "Failed to allocate host memory. (subOutput)");    return SDK_SUCCESS;}

输入量初始化部分。

intDeviceFission::setupCLPlatform(){    cl_int status = CL_SUCCESS;    /*     * Have a look at the available platforms and pick either     * the AMD one if available or a reasonable default.     */    cl_platform_id platform = NULL;    int retValue = getPlatform(platform, sampleArgs->platformId,                               sampleArgs->isPlatformEnabled());    CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform(rootplatform) failed");    // Display available devices.    retValue = displayDevices(platform, CL_DEVICE_TYPE_ALL);    CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices(rootplatform) failed");    /*     * If we could find our platform, use it. Otherwise use just available platform.     */    cl_context_properties cps[3] =    {        CL_CONTEXT_PLATFORM,        (cl_context_properties)platform,        0    };    rContext = clCreateContextFromType(platform ? cps : NULL,                                       CL_DEVICE_TYPE_ALL,                                       NULL,                                       NULL,                                       &status);    CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed.");    // getting devices on which to run the sample    status = getDevices(rContext, &Devices, 0, sampleArgs->isDeviceIdEnabled());    CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed");    // Set deviceListSize from clGetContextInfo    status = clGetContextInfo(rContext, CL_CONTEXT_DEVICES, 0, 0, &deviceListSize);    CHECK_ERROR(status, SDK_SUCCESS, "clGetContextInfo failed. (deviceListSize)");    // Get GPU device and CPU devices by the deviceInfo.    for (cl_uint i = 0 ; i < deviceListSize / sizeof(cl_device_id) ; i++)    {        retValue = deviceInfo.setDeviceInfo(Devices[i]);        CHECK_ERROR(retValue, 0, "SDKDeviceInfo::setDeviceInfo() failed");        if (deviceInfo.dType == CL_DEVICE_TYPE_CPU)        {            cpuDevice = Devices[i];        }    }    // Get allocate memory for subDevices    subDevices = (cl_device_id*)malloc(numSubDevices * sizeof(cl_device_id));    CHECK_ALLOCATION(subDevices, "Failed to allocate memory. (subDevices)");    // Get allocate memory for subKernel    subKernel = (cl_kernel*)malloc(numSubDevices * sizeof(cl_kernel));    CHECK_ALLOCATION(subKernel, "Failed to allocate memory. (subKernel)");    // Get maxSubDevices from clGetDeviceInfo    cl_uint maxSubDevices;    status = clGetDeviceInfo(cpuDevice, CL_DEVICE_PARTITION_MAX_SUB_DEVICES,                             sizeof(maxSubDevices), &maxSubDevices, NULL);    CHECK_OPENCL_ERROR(status, "clGetDeviceInfo failed. (maxSubDevices)")    if(maxSubDevices <= 1)    {        std::cout<<"Error: The CPU should have more than one core to run this sample."<<std::endl;        return SDK_FAILURE;    }    // Initialize required partition property    cl_device_partition_property partitionPrty[5] =    {        CL_DEVICE_PARTITION_BY_COUNTS,        maxSubDevices / 2, maxSubDevices / 2,        CL_DEVICE_PARTITION_BY_COUNTS_LIST_END,        0    };    // Create sub-devices    status = clCreateSubDevices(cpuDevice, partitionPrty, numSubDevices, subDevices,                                NULL);    CHECK_OPENCL_ERROR( status, "clCreateSubDevices failed.");    return SDK_SUCCESS;}

这个函数就是设备拆分的关键之一了，前面没什么好讲的就是获取平台信息然后取一个CPU设备以及创建Context等。之后就是根据需要创建的子设备数分配足够的subDivices和subKernel空间。然后通过clGetDeviceInfo首先通过CL_DEVICE_PARTITION_MAX_SUB_DEVICES查找到maxSubDevices参数并判断是否可以拆分。获取之后就可以进行设备的创建了。就是将CPU平均分成两份。那个cl_device_partition_property格式挺别致的，注意下写法。之后就是用clCreateSubDevices创建子设备列表并存储在subDevices中。为了程序的完整，这里将setupRuntime也贴一下吧：

intDeviceFission::setupCLRuntime(){    cl_int status = CL_SUCCESS;    // Create a CL program using the kernel source    buildProgramData buildData;    buildData.kernelName = std::string("DeviceFission_Kernels.cl");    buildData.devices = Devices;    buildData.deviceId = sampleArgs->deviceId;    buildData.flagsStr = std::string("");    if(sampleArgs->isLoadBinaryEnabled())    {        buildData.binaryName = std::string(sampleArgs->loadBinary.c_str());    }    if(sampleArgs->isComplierFlagsSpecified())    {        buildData.flagsFileName = std::string(sampleArgs->flags.c_str());    }    // Get allocate memory for subCmdQueue    subCmdQueue = (cl_command_queue*)malloc(numSubDevices * sizeof(            cl_command_queue));    CHECK_ALLOCATION(subCmdQueue,"Failed to allocate memory. (subCmdQueue)");    // Create command queue subCmdQueue    for(cl_uint i = 0; i < numSubDevices; i++)    {        // Create command queue        subCmdQueue[i] = clCreateCommandQueue(rContext,                                              subDevices[i],                                              0,                                              &status);        CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed. (subCmdQueue)");    }    // Create memory objects for input    InBuf = clCreateBuffer(rContext,                           CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,                           length * sizeof(cl_int),                           NULL,                           &status);    CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (InBuf)");    // Get allocate memory for sub devices output    subOutBuf = (cl_mem*)malloc(numSubDevices * sizeof(cl_mem));    for(cl_uint i = 0; i < numSubDevices; i++)    {        // Create memory objects for sub devices output        subOutBuf[i] = clCreateBuffer(rContext,                                      CL_MEM_WRITE_ONLY,                                      half_length * sizeof(cl_int) ,                                      NULL,                                      &status);        CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (subOutBuf)");    }    SDKFile kernelFile;    std::string kernelPath = getPath();    char * source = NULL;    size_t sourceSize[] = {0};    char * binary = NULL;    size_t binarySize = 0;    if(sampleArgs->isLoadBinaryEnabled())    {        kernelPath += sampleArgs->loadBinary;        if(kernelFile.readBinaryFromFile(kernelPath.c_str()))        {            std::cout << "Failed to load kernel file : " << kernelPath << std::endl;            return SDK_FAILURE;        }        // Get binaries and binary sizes for CPU devices        char** subBinaries = (char**)malloc(numSubDevices * sizeof(char*));        if(subBinaries == NULL)        {            error("Failed to allocate memory(subBinaries)");            return SDK_FAILURE;        }        size_t* subBinariesSize = (size_t*)malloc(numSubDevices * sizeof(size_t*));        if(subBinariesSize == NULL)        {            error("Failed to allocate memory(subBinariesSize)");            return SDK_FAILURE;        }        for(cl_uint i = 0; i < numSubDevices; ++i)        {            subBinaries[i] = (char*)kernelFile.source().c_str();            subBinariesSize[i] = kernelFile.source().size();        }        subProgram = clCreateProgramWithBinary(rContext,                                               numSubDevices,                                               subDevices,                                               (const size_t *)subBinariesSize,                                               (const unsigned char**)subBinaries,                                               NULL,                                               &status);        CHECK_OPENCL_ERROR(status, "clCreateProgramWithBinary failed.(subProgram)");        free(subBinaries);        free(subBinariesSize);        subBinariesSize = NULL;        subBinaries = NULL;    }    else    {        kernelPath.append("DeviceFission_Kernels.cl");        if(!kernelFile.open(kernelPath.c_str()))//bool        {            std::cout << "Failed to load kernel file: " << kernelPath << std::endl;            return SDK_FAILURE;        }        const char * source = kernelFile.source().c_str();        size_t sourceSize[] = {strlen(source)};        // Create a CL program for sub-devices using the kernel source        subProgram = clCreateProgramWithSource(rContext,                                               1,                                               (const char**)&source,                                               sourceSize,                                               &status);        CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(subProgram)");        // Create a CL program for GPU device using the kernel source        gpuProgram = clCreateProgramWithSource(rContext,                                               1,                                               (const char**)&source,                                               sourceSize,                                               &status);        CHECK_OPENCL_ERROR(status, "clCreateProgramWithSource failed.(gpuProgram)");    }    // Get build options    const char *flags;    SDKFile flagsFile;    std::string flagsPath = getPath();    if(buildData.flagsFileName.size() != 0)    {        flagsPath.append(buildData.flagsFileName.c_str());        if(!flagsFile.open(flagsPath.c_str()))        {            std::cout << "Failed to load flags file: " << flagsPath << std::endl;            return SDK_FAILURE;        }        flagsFile.replaceNewlineWithSpaces();        flags = flagsFile.source().c_str();        if(strlen(flags) != 0)        {            std::cout << "Build Options are : " << flags << std::endl;        }    }    else    {        flags = NULL;    }    // Create a cl program executable for all sub-devices    status = clBuildProgram(subProgram,                            numSubDevices,                            subDevices,                            flags,                            NULL,                            NULL);    CHECK_OPENCL_ERROR(status, "clBuildProgram failed.(subProgram)");    if(status != CL_SUCCESS)    {        if(status == CL_BUILD_PROGRAM_FAILURE)        {            cl_int logStatus;            char * buildLog = NULL;            size_t buildLogSize = 0;            logStatus = clGetProgramBuildInfo(subProgram,                                              subDevices[0],                                              CL_PROGRAM_BUILD_LOG,                                              buildLogSize,                                              buildLog,                                              &buildLogSize);            if(!checkVal(logStatus,                         CL_SUCCESS,                         "clGetProgramBuildInfo failed."))            {                return SDK_FAILURE;            }            buildLog = (char*)malloc(buildLogSize);            if(NULL == buildLog)            {                error("Failed to allocate host memory.(buildLog)");                return SDK_FAILURE;            }            memset(buildLog, 0, buildLogSize);            logStatus = clGetProgramBuildInfo(subProgram,                                              subDevices[0],                                              CL_PROGRAM_BUILD_LOG,                                              buildLogSize,                                              buildLog,                                              NULL);            if(!checkVal(logStatus,                         CL_SUCCESS,                         "clGetProgramBuildInfo failed."))            {                free(buildLog);                return SDK_FAILURE;            }            std::cout << " \n\t\t\tBUILD LOG(SUB-DEVICES)\n";            std::cout << " ************************************************\n";            std::cout << buildLog << std::endl;            std::cout << " ************************************************\n";            free(buildLog);        }        if(!checkVal(status,                     CL_SUCCESS,                     "clBuildProgram failed. (SUB-DEVICES)"))        {            return SDK_FAILURE;        }    }    // Get a kernel object handle for a kernel with the given name    subKernel[0] = clCreateKernel(subProgram,                                  "Add",                                  &status);    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[0])");    // Get a kernel object handle for a kernel with the given name    subKernel[1] = clCreateKernel(subProgram,                                  "Sub",                                  &status);    CHECK_OPENCL_ERROR(status, "clCreateKernel failed.(subKernel[1])");    return SDK_SUCCESS;}

这里可以看到对于每一个子设备都创建了对应的CommandQueue,并且创建了两个内核，一个对应于Add操作，一个对应于Sub操作。最后看下执行函数：

intDeviceFission::runCLALLKerenls(){    cl_int status;    cl_event writeEvent;    cl_event rangeEvent[2];    // Set global and local work items    size_t globalThreads[] = {half_length};    size_t localThreads[] = {groupSize};    // Enqueue write Buffer to the first sub device queue    status = clEnqueueWriteBuffer(subCmdQueue[0],                                  InBuf,                                  CL_FALSE,                                  0,                                  half_length* sizeof(cl_int),                                  input,                                  0,                                  NULL,                                  &writeEvent);    CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed");    cl_uint rangeEventNum = 0;    rangeEvent[0] = rangeEvent[1] = writeEvent;    rangeEventNum++;    for(cl_uint i = 0; i < numSubDevices; ++i)    {        // Set subOutBuf as second argument        status = clSetKernelArg(subKernel[i], 1, sizeof(cl_mem), (void*)&subOutBuf[i]);        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (subOutBuf)");        // Set InBuf as first argument        status = clSetKernelArg(subKernel[i], 0, sizeof(cl_mem),(void*)&InBuf);        CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (InBuf)");        // Enqueue kernel        status = clEnqueueNDRangeKernel(subCmdQueue[i],                                        subKernel[i],                                        1,                                        NULL,                                        globalThreads,                                        localThreads,                                        rangeEventNum,                                        &rangeEvent[i],                                        NULL);        CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed.(subCmdQueue)");        // Enqueue readBuffer        status = clEnqueueReadBuffer(subCmdQueue[i],                                     subOutBuf[i],                                     CL_FALSE,                                     0,                                     half_length * sizeof(cl_int),                                     subOutput + half_length * i,                                     0,                                     NULL,                                     NULL);        CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed. (subCmdQueue)");    }    // Flush all queues together    for(cl_uint i = 0; i < numSubDevices; ++i)    {        status = clFlush(subCmdQueue[i]);        CHECK_OPENCL_ERROR(status, "clFlush failed. (subCmdQueue)");    }    // Finish all queues    status = clFinish(subCmdQueue[0]);    CHECK_OPENCL_ERROR(status, "clFinish failed. (subCmdQueue[0])");    status = clFinish(subCmdQueue[1]);    CHECK_OPENCL_ERROR(status, "clFinish failed. (subCmdQueue[1])");    status = clReleaseEvent(writeEvent);    CHECK_OPENCL_ERROR(status, "clReleaseEvent failed. (writeEvent)");    return SDK_SUCCESS;}

可以看到InBuf的大小为subOutBuf大小的一半。需要注意的是clEnqueueNDRangeKernel是需要等待clEnqueueWriteBuffer写完输入数据才会开始进行计算。

0 0