OpenCl_CPU加速矩阵运算
来源:互联网 发布:淘宝明星店铺 编辑:程序博客网 时间:2024/06/01 08:21
本博文使用的是 Intel 的 OpenCL SDK,下载链接:https://software.intel.com/en-us/intel-opencl/download ,按默认选项安装即可。
注意:安装完毕后opencl的sdk在路径C:\Program Files (x86)\Intel\OpenCL SDK\6.3下
第一步:检验计算机硬件设备
安装完毕检验硬件设备,查看平台数量,代码如下:
- #include <iostream>
- #include <malloc.h>
- #include <CL/cl.h>//包含CL的头文件
-
- using namespace std;
-
-
- const char* GetDeviceType(cl_device_type it)
- {
- if (it == CL_DEVICE_TYPE_CPU)
- return "CPU";
- else if (it == CL_DEVICE_TYPE_GPU)
- return "GPU";
- else if (it == CL_DEVICE_TYPE_ACCELERATOR)
- return "ACCELERATOR";
- else
- return "DEFAULT";
-
- }
-
- int main()
- {
- char dname[512];
- cl_device_id devices[20];
- cl_platform_id* platform_id = NULL;
- cl_uint num_devices;
- cl_device_type int_type;
- cl_ulong long_entries;
- cl_uint num_platform;
- cl_int err;
-
-
- err = clGetPlatformIDs(0, NULL, &num_platform);
-
- if (err != CL_SUCCESS)
- {
- cout << "clGetPlatformIDs error" << endl;
- return 0;
- }
-
- cout << "PlatForm num:" << num_platform << endl;
-
- int st = 0;
-
- platform_id = new cl_platform_id[num_platform];
-
- err = clGetPlatformIDs(num_platform, platform_id, NULL);
-
- if (err != CL_SUCCESS)
- {
- cout << "clGetPlatformIDs error" << endl;
- return 0;
- }
-
- for (st = 0; st<num_platform; st++)
- {
- cout << "----------------------------------" << endl;
- cout << "Platform " << st + 1 << endl;
-
-
- clGetPlatformInfo(platform_id[st], CL_PLATFORM_NAME, 512, dname, NULL);
- cout << "CL_PLATFORM_NAME:" << dname << endl;
-
-
- clGetPlatformInfo(platform_id[st], CL_PLATFORM_VENDOR, 512, dname, NULL);
- cout << "CL_PLATFORM_VERSION:" << dname << endl;
-
-
- clGetDeviceIDs(platform_id[st], CL_DEVICE_TYPE_ALL, 20, devices, &num_devices);
- cout << "Device num:" << num_devices << endl;
-
- unsigned int n = 0;
-
-
- for (n = 0; n<num_devices; n++)
- {
- cout << endl << "Device " << n + 1 << endl;
-
- clGetDeviceInfo(devices[n], CL_DEVICE_NAME, 512, dname, NULL);
- cout << "Device :" << dname << endl;
-
-
- clGetDeviceInfo(devices[n], CL_DEVICE_TYPE, sizeof(cl_device_type),
- &int_type, NULL);
- cout << "Device Type:" << GetDeviceType(int_type) << endl;
-
-
- clGetDeviceInfo(devices[n], CL_DRIVER_VERSION, 512, dname, NULL);
- cout << "Device version:" << dname << endl;
-
-
- clGetDeviceInfo(devices[n], CL_DEVICE_GLOBAL_MEM_SIZE,
- sizeof(cl_ulong), &long_entries, NULL);
- cout << "Device global mem(MB):" <<
- long_entries / 1024 / 1024 << endl;
-
-
- clGetDeviceInfo(devices[n], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE,
- sizeof(cl_ulong), &long_entries, NULL);
- cout << "Device global mem cache(KB):" <<
- long_entries / 1024 << endl;
-
-
- clGetDeviceInfo(devices[n], CL_DEVICE_LOCAL_MEM_SIZE,
- sizeof(cl_ulong), &long_entries, NULL);
- cout << "Device Locale mem(KB) :" << long_entries / 1024 << endl;
-
-
- clGetDeviceInfo(devices[n], CL_DEVICE_MAX_CLOCK_FREQUENCY,
- sizeof(cl_ulong), &long_entries, NULL);
- cout << "Device Max clock(MHz) :" << long_entries << endl;
-
-
- clGetDeviceInfo(devices[n], CL_DEVICE_MAX_WORK_GROUP_SIZE,
- sizeof(cl_ulong), &long_entries, NULL);
- cout << "Device Max Group size :" << long_entries << endl;
-
-
- clGetDeviceInfo(devices[n], CL_DEVICE_MAX_COMPUTE_UNITS,
- sizeof(cl_ulong), &long_entries, NULL);
- cout << "Device Max parallel cores:" << long_entries << endl;
-
- }
- }
-
- return 0;
- }
如果发现 CPU 设备的 OpenCL 版本显示为实验版本 2.1,则还需要下载并安装 runtime 配置文件,按默认选项安装即可。
配置文件链接:http://pan.baidu.com/s/1geNnMy3 密码:7n8o
再次检验硬件设备,查看平台数量,可以发现cpu的opencl版本正确。
第二步:矩阵运算加速(以两个向量逐元素相加为例)
代码如下:
核函数vecadd.cl文件如下:
/*
 * Element-wise integer addition: C[i] = A[i] + B[i].
 *
 * Each work-item processes the single element selected by its global id
 * in dimension 0. There is no bounds check, so the global work size must
 * not exceed the number of elements in the three buffers.
 */
__kernel void vecAdd(__global int* A,
                     __global int* B,
                     __global int* C)
{
    const int gid = get_global_id(0);

    const int lhs = A[gid];
    const int rhs = B[gid];

    C[gid] = lhs + rhs;
}
主函数main.cpp文件如下:
- #include <iostream>
- #include <stdio.h>
- #include <string.h>
- #include <string>
- #include <vector>
- #include <CL/cl.h>//包含CL的头文件
-
-
-
- #pragma warning( disable : 4996 )
-
- using namespace std;
-
-
- #define elements 100
-
-
- bool GetFileData(const char* fname, string& str)
- {
- FILE* fp = fopen(fname, "r");
- if (fp == NULL)
- {
- printf("no found filen");
- return false;
- }
-
- int n = 0;
- while (feof(fp) == 0)
- {
- str += fgetc(fp);
- }
-
- return true;
- }
-
- int main()
- {
-
-
- string code_file;
-
- if (false == GetFileData("vecadd.cl", code_file))
- {
- return 0;
- }
-
- char* buf_code = new char[code_file.size()];
- strcpy(buf_code, code_file.c_str());
- buf_code[code_file.size() - 1] = NULL;
-
-
- cl_device_id device;
- cl_platform_id *platform_id = NULL;
- cl_context context;
- cl_command_queue cmdQueue;
- cl_mem bufferA, bufferB, bufferC;
- cl_program program;
- cl_kernel kernel = NULL;
-
-
-
- size_t globalWorkSize[1];
- globalWorkSize[0] = elements;
-
- cl_int err;
-
-
- int* buf_A = new int[elements];
- int* buf_B = new int[elements];
- int* buf_C = new int[elements];
-
- size_t datasize = sizeof(int) * elements;
-
- for (int i = 0; i < elements; i++)
- {
- buf_A[i] = (float)i;
- buf_B[i] = (float)i + 1.0;
- }
-
-
- cl_uint num_platform;
- err = clGetPlatformIDs(0, NULL, &num_platform);
-
- platform_id = new cl_platform_id[num_platform];
- err = clGetPlatformIDs(num_platform, platform_id, NULL);
-
- if (err != CL_SUCCESS)
- {
- cout << "clGetPlatformIDs error" << endl;
- return 0;
- }
-
-
- clGetDeviceIDs(platform_id[2], CL_DEVICE_TYPE_CPU, 1, &device, NULL);
-
-
- context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
-
-
- cmdQueue = clCreateCommandQueue(context, device, 0, NULL);
-
-
- bufferA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, NULL);
- bufferB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, NULL);
- bufferC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, NULL);
-
-
- clEnqueueWriteBuffer(cmdQueue, bufferA, CL_FALSE, 0, datasize, buf_A, 0, NULL, NULL);
- clEnqueueWriteBuffer(cmdQueue, bufferB, CL_FALSE, 0, datasize, buf_B, 0, NULL, NULL);
-
-
- program = clCreateProgramWithSource(context, 1, (const char**)&buf_code, NULL, NULL);
- clBuildProgram(program, 1, &device, NULL, NULL, NULL);
- kernel = clCreateKernel(program, "vecAdd", NULL);
-
-
- clSetKernelArg(kernel, 0, sizeof(cl_mem), &bufferA);
- clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufferB);
- clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufferC);
-
- clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL);
-
-
- clEnqueueReadBuffer(cmdQueue, bufferC, CL_TRUE, 0, datasize, buf_C, 0, NULL, NULL);
-
-
- cout << buf_A[0] << "+" << buf_B[0] << "=" << buf_C[0] << endl;
- cout << buf_A[elements - 1] << "+" << buf_B[elements - 1] << "=" << buf_C[elements - 1] << endl;
-
-
- clReleaseKernel(kernel);
- clReleaseProgram(program);
- clReleaseCommandQueue(cmdQueue);
- clReleaseMemObject(bufferA);
- clReleaseMemObject(bufferB);
- clReleaseMemObject(bufferC);
- clReleaseContext(context);
-
- delete[]platform_id;
- delete[]buf_A;
- delete[]buf_B;
- delete[]buf_C;
- delete[]buf_code;
- system("pause");
- return 0;
- }
测试结果显示如下: