OpenCL做并行滤波
来源:互联网 发布:云计算运维是干什么的 编辑:程序博客网 时间:2024/06/07 09:28
本实验使用OpenCL对一维信号进行滤波。主要思路是以离散信号的序列点为单位划分任务,一个工作项负责一个信号点的计算。这样做的好处是实现简单,并且相对于串行实现能获得相当大的性能提升;缺点是各工作项的负载不均衡:前255个输出点需要的乘加次数少于其余各点的256次。
host.c
#include<stdio.h>#include<windows.h>#include<math.h>#include<CL/cl.h>#pragma warning( disable : 4996 )#define MIXSIZE 8192*65int main() { cl_int error; cl_platform_id platforms; cl_device_id devices; cl_context context; FILE *program_handle; size_t program_size; char *program_buffer; cl_program program; size_t log_size; char *program_log; char kernel_name[] = "createBuffer"; cl_kernel kernel; cl_command_queue queue; //获取平台 error = clGetPlatformIDs(1, &platforms, NULL); if (error != 0) { printf("Get platform failed!"); return -1; } error = clGetDeviceIDs(platforms, CL_DEVICE_TYPE_GPU, 1, &devices, NULL); if (error != 0) { printf("Get device failed!"); return -1; } //创建上下文 context = clCreateContext(NULL,1,&devices,NULL,NULL,&error); if (error != 0) { printf("Creat context failed!"); return -1; } //创建程序 program_handle = fopen("kernel.cl","rb"); if (program_handle == NULL) { printf("The kernle can not be opened!"); return -1; } fseek(program_handle,0,SEEK_END); program_size = ftell(program_handle); rewind(program_handle); program_buffer = (char *)malloc(program_size+1); program_buffer[program_size] = '\0'; error=fread(program_buffer,sizeof(char),program_size,program_handle); if (error == 0) { printf("Read kernel failed!"); return -1; } fclose(program_handle); program = clCreateProgramWithSource(context,1,(const char **)&program_buffer,&program_size,&error); if (error < 0) { printf("Couldn't create the program!"); return -1; } //编译程序 error = clBuildProgram(program,1,&devices,NULL,NULL,NULL); if (error < 0) { //确定日志文件的大小 clGetProgramBuildInfo(program,devices,CL_PROGRAM_BUILD_LOG,0,NULL,&log_size); program_log = (char *)malloc(log_size+1); program_log[log_size] = '\0'; //读取日志 clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("%s\n",program_log); free(program_log); getchar(); return -1; } //创建命令队列 queue = clCreateCommandQueue(context, devices, CL_QUEUE_PROFILING_ENABLE, &error); if (error < 0) { printf("Coudn't create the 
command queue"); return -1; } //创建内核 kernel = clCreateKernel(program,kernel_name,&error); if (kernel==NULL) { printf("Couldn't create kernel!\n"); return -1; } //创建缓存对象 cl_mem memObject1 = clCreateBuffer(context,CL_MEM_READ_ONLY , sizeof(float) * 256,NULL,&error); if (error < 0) { printf("Creat memObject1 failed!\n"); return -1; } cl_mem memObject2 = clCreateBuffer(context, CL_MEM_READ_ONLY , sizeof(float) * MIXSIZE / 65, NULL, &error); if (error < 0) { printf("Creat memObject2 failed!\n"); return -1; } cl_mem memObject3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY , sizeof(float) * MIXSIZE/65, NULL, &error); if (error < 0) { printf("Creat memObject3 failed!\n"); return -1; } //设置内核参数 error = clSetKernelArg(kernel,0,sizeof(cl_mem),&memObject1); error |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObject2); error |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObject3); if (error != CL_SUCCESS) { printf("Error setting kernel arguments!\n"); return -1; } //初始化参数 float* input1 = (float *)malloc(sizeof(float)* 256); float* input2 = (float *)malloc(sizeof(float)* MIXSIZE / 65); float* result = (float *)malloc(sizeof(float)* MIXSIZE / 65); float *check = (float *)malloc(sizeof(float) * MIXSIZE / 65); memset(check, 0, sizeof(float) * MIXSIZE / 65); memset(input1, 1, sizeof(float) * 256); memset(input2, 0, sizeof(float) * MIXSIZE / 65); memset(result, 0, sizeof(float) * MIXSIZE / 65); cl_event evt1; cl_event evt2; cl_event evt3; float* tmp1 = (float *)malloc(sizeof(float)* MIXSIZE); float *tmp2 = (float *)malloc(sizeof(float) * MIXSIZE / 65); memset(tmp2, 0, sizeof(float) * MIXSIZE / 65); //数据读入 //采用随机数函数产生输入 //input2是65*8192 srand(1); for (int j = 0; j < 8192; j++) { input2[ j] = 20 * rand() / (double)(RAND_MAX); // input2[j] = 1; check[j] = 0; } for (int j = 0; j < 256; j++) { input1[j] = 1; } //检查运算结果 for (int j = 0; j < 8192; j++) { if (j > 255) { for (int k = 0; k < 256; k++) { check[j] += input2[j - k] * input1[k]; } } else { for (int k = 0; k < j+1; k++) { 
check[j] += input2[j-k] * input1[k]; } } } //数据写入内存 error = clEnqueueWriteBuffer(queue, memObject1, CL_FALSE, 0, 256 * sizeof(float), input1, 0, NULL, &evt1); if (error != CL_SUCCESS) { printf("write data failed!\n"); return -1; } error = clEnqueueWriteBuffer(queue, memObject2, CL_FALSE, 0, MIXSIZE * sizeof(float) / 65, input2, 1, &evt1, &evt2); if (error != CL_SUCCESS) { printf("write data failed!\n"); return -1; } //配置工作项 size_t maxWorkGroupSize = 0; clGetDeviceInfo(devices, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(maxWorkGroupSize), &maxWorkGroupSize, NULL); size_t globalWorkSize = 8192; size_t localWorkSize = maxWorkGroupSize; //执行内核 error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalWorkSize, &localWorkSize, 1, &evt2, &evt3); if (error != CL_SUCCESS) { printf("Error queuing kernel for execution!\n"); return -1; } //读取执行结果 error = clEnqueueReadBuffer(queue,memObject3,CL_TRUE,0, MIXSIZE*sizeof(float)/65,result,1,&evt3,NULL); if (error != CL_SUCCESS) { printf("Error reading result buffer!\n"); return -1; } //显示结果 for (int i = 0; i < MIXSIZE/65; i++) { if ((result[i] /check[i]<0.999) | check[i]==0) { printf("failed!\n"); printf("%f,%f,%d\n",result[i],check[i],i); getchar(); return 0; } } printf("successed!\n"); clReleaseEvent(evt1); clReleaseEvent(evt2); clReleaseEvent(evt3); clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); clReleaseDevice(devices); clReleaseKernel(kernel); getchar(); return 0;}
kernel.cl
// 1-D convolution (FIR filtering), one work-item per output sample.
//
// Assumptions stated by the original listing:
//   - the signal holds 8192 samples and the global work size is 8192;
//   - there are 256 convolution coefficients;
//   - input1 holds the coefficients, input2 holds the signal data.
//
// result[gid] = sum over i of input1[i] * input2[gid - i].
__kernel void createBuffer(__global float *input1, __global float *input2, __global float *result)
{
    int gid = get_global_id(0);

    // Work-items fall into two groups: those with gid > 255 apply all 256
    // coefficients, the others only have gid + 1 samples behind them.
    int taps = (gid > 255) ? 256 : gid + 1;

    // Accumulate in a private variable and store once. Accumulating directly
    // with `result[gid] += ...` would read whatever the output buffer held
    // before the launch, so the answer depended on its prior contents.
    float acc = 0.0f;
    for (int i = 0; i < taps; i++) {
        acc += input1[i] * input2[gid - i];
    }
    result[gid] = acc;
}
这篇博客中的kernel函数将工作项分为两部分:索引大于255的和索引不大于255的,因为这两部分的运算量不同(前者固定做256次乘加,后者只做 gid+1 次)。
在上一篇关于OpenCL实现序列卷积的博客中提到了一种方式;但是没有考虑到工作项之间的同步问题;运算结果有误;
下一篇博客中会对两种方式进行比较;总结工作项之间的同步问题。以前的博客中总结过主机端命令同步问题,这次完成对工作项之间的同步;OpenCL中同步的基础知识就差不多了。
0 0
- OpenCL做并行滤波
- OpenCL使用GPU滤波
- OpenCL使用GPU滤波
- OpenCL最小线程并行粒度
- OpenCL异构并行计算
- opencl基础概念之并行
- openCl-work-item的并行的理解
- OpenCL与异构并行计算
- OpenCL与异构并行计算
- 并行运算框架OpenCL的一些基本概念
- 利用OpenCL对OpenCV并行化心得(1)
- 利用OpenCL对OpenCV并行化心得(2)
- 利用OpenCL对OpenCV并行化心得(3)
- 使用 OpenCL.Net 进行 C# GPU 并行编程
- OPENCL如何获取获取最小线程并行粒度
- Android平台利用OpenCL框架实现并行开发初试
- OpenCL学习笔记(二):并行编程概念理解
- 【并行计算-CUDA开发】Windows下opencl环境配置
- 关于AOP的几点思考
- wampserver下环境下配置虚拟域名
- 【ios开发技术研究】Xcode8 及iOS10适配问题整理汇总
- 断点续传【TODO】
- Hadoop namenode重新格式化需注意问题
- OpenCL做并行滤波
- ESP传输模式拆解包流程
- iOS开发之UIView与CALayer的异同
- 数霉派上安装Archlinux
- Upgrades ubuntu14.0 升级到16.0 的命令行
- 【扩展欧几里德算法】
- git代理设置 加速git clone
- Linux基础命令详解 持续更新中
- 北大poj ACM试题分类