图像均值滤波的CUDA并行化优化
来源:互联网 发布:程序员奇葩面试题 编辑:程序博客网 时间:2024/05/17 22:56
1、算法原理
均值滤波也是线性滤波,目标点的像素为周围(模板覆盖)像素的平均值。对图像进行均值滤波处理时,每一个像素点的处理与其它像素点无关,所以,可以把对于每一个像素的处理映射到每个线程中,从而实现并行化。
2、并行思路
将像素映射到二维坐标空间,然后使用i*WIDTH+j的方式索引像素。本实例使用3×3模板,即读取目标像素及其周围共9个像素点的值:每一个线程读取自己所负责像素的邻域像素,然后对其求平均值。为了简化计算,边界像素直接复制原图像的像素值。代码中采用的block size为256×1。
3、CPU实现代码
对于C++读取图片的方式代码如下,这里不再细讲,需要深究的可以去网上查阅相关博客。
readImage.h
#pragma once

#include <stdint.h>

// Fixed-width aliases for the integer types used by the BMP file format.
// The structures below are read from / written to disk with a single
// fread/fwrite, so every field must have exactly the on-disk width on every
// platform (a plain `long` is 8 bytes on LP64 systems and would break the
// 40-byte info header).
typedef uint8_t  BYTE;
typedef uint16_t WORD;
typedef uint32_t DWORD;
typedef int32_t  LONG;

// Bitmap file header.
// The 2-byte file-type field is deliberately NOT part of this struct: with it
// the compiler would insert padding and the struct could no longer be read
// from the file in one piece. The type field is read/written separately.
typedef struct tagBITMAPFILEHEADER {
    // WORD bfType;    // file type, the characters "BM"
    DWORD bfSize;      // file size
    WORD bfReserved1;  // reserved
    WORD bfReserved2;  // reserved
    DWORD bfOffBits;   // byte offset from the start of the file to the pixel data
} BITMAPFILEHEADER;

// Bitmap information header.
typedef struct tagBITMAPINFOHEADER {
    DWORD biSize;          // size of the information header
    LONG biWidth;          // image width
    LONG biHeight;         // image height
    WORD biPlanes;         // number of planes, must be 1
    WORD biBitCount;       // bits per pixel
    DWORD biCompression;   // compression type
    DWORD biSizeImage;     // size of the (compressed) image data in bytes
    LONG biXPelsPerMeter;  // horizontal resolution
    LONG biYPelsPerMeter;  // vertical resolution
    DWORD biClrUsed;       // number of colours actually used by the bitmap
    DWORD biClrImportant;  // number of important colours in the bitmap
} BITMAPINFOHEADER;

// Palette entry.
typedef struct tagRGBQUAD {
    BYTE rgbBlue;      // blue component
    BYTE rgbGreen;     // green component
    BYTE rgbRed;       // red component
    BYTE rgbReserved;  // reserved
} RGBQUAD;

// Pixel information.
typedef struct tagIMAGEDATA {
    BYTE blue;
} IMAGEDATA;

// Compile-time layout checks: a negative array size is a compile error, so the
// build fails if a struct stops matching the size that is read from the file.
typedef char bmpFileHeaderSizeCheck[(sizeof(BITMAPFILEHEADER) == 12) ? 1 : -1];
typedef char bmpInfoHeaderSizeCheck[(sizeof(BITMAPINFOHEADER) == 40) ? 1 : -1];
typedef char bmpRgbQuadSizeCheck[(sizeof(RGBQUAD) == 4) ? 1 : -1];

// Reads the pixel data of the BMP file at `path`; the row length in bytes and
// the row count are returned through `width` and `height`.
unsigned char* readImageData(const char* path, int& width, int& height);

// Writes `imagedata` (height rows of width bytes) to a BMP file at `path`.
void saveImageData(const char* path, int width, int height, unsigned char* imagedata);

// Print the most recently read file header / information header.
void showBmpHead();
void showBmpInforHead();

// Article label for the next listing: readImage.cpp
#include <stdio.h> #include "readImage.h" #include "stdlib.h" #include "math.h" #include <iostream> #define PI 3.14159//圆周率宏定义 #define LENGTH_NAME_BMP 30//bmp图片文件名的最大长度 using namespace std;//变量定义 BITMAPFILEHEADER strHead;RGBQUAD strPla[256];//256色调色板 BITMAPINFOHEADER strInfo;//显示位图文件头信息 void showBmpHead() {cout << "位图文件头:" << endl;cout << "文件大小:" << strHead.bfSize << endl;cout << "保留字_1:" << strHead.bfReserved1 << endl;cout << "保留字_2:" << strHead.bfReserved2 << endl;cout << "实际位图数据的偏移字节数:" << strHead.bfOffBits << endl << endl;}void showBmpInforHead() {cout << "位图信息头:" << endl;cout << "结构体的长度:" << strInfo.biSize << endl;cout << "位图宽:" << strInfo.biWidth << endl;cout << "位图高:" << strInfo.biHeight << endl;cout << "biPlanes平面数:" << strInfo.biPlanes << endl;cout << "biBitCount采用颜色位数:" << strInfo.biBitCount << endl;cout << "压缩方式:" << strInfo.biCompression << endl;cout << "biSizeImage实际位图数据占用的字节数:" << strInfo.biSizeImage << endl;cout << "X方向分辨率:" << strInfo.biXPelsPerMeter << endl;cout << "Y方向分辨率:" << strInfo.biYPelsPerMeter << endl;cout << "使用的颜色数:" << strInfo.biClrUsed << endl;cout << "重要颜色数:" << strInfo.biClrImportant << endl;}unsigned char* readImageData(const char* path, int& width, int& height) {unsigned char* imagedata = NULL;//动态分配存储原图片的像素信息的二维数组 FILE *fpi;fpi = fopen(path, "rb");if (!fpi) {cout << "file open error!" << endl;return NULL;}else{//先读取文件类型 WORD bfType;fread(&bfType, 1, sizeof(WORD), fpi);if (0x4d42 != bfType){cout << "the file is not a bmp file!" 
<< endl;return NULL;}//读取bmp文件的文件头和信息头 fread(&strHead, sizeof(tagBITMAPFILEHEADER), 1, fpi);fread(&strInfo, sizeof(tagBITMAPINFOHEADER), 1, fpi);//showBmpInforHead(strInfo);//显示文件信息头 //读取调色板 for (unsigned int nCounti = 0; nCounti<strInfo.biClrUsed; nCounti++){fread((char *)&(strPla[nCounti].rgbBlue), sizeof(BYTE), 1, fpi);fread((char *)&(strPla[nCounti].rgbGreen), sizeof(BYTE), 1, fpi);fread((char *)&(strPla[nCounti].rgbRed), sizeof(BYTE), 1, fpi);fread((char *)&(strPla[nCounti].rgbReserved), sizeof(BYTE), 1, fpi);}width = strInfo.biWidth;height = strInfo.biHeight;//图像每一行的字节数必须是4的整数倍 width = (width * sizeof(unsigned char) + 3) / 4 * 4;imagedata = (unsigned char*)malloc(width * height * sizeof(unsigned char));//初始化原始图片的像素数组 for (int i = 0; i < height; ++i){for (int j = 0; j < width; ++j){(*(imagedata + i * width + j)) = 0;}}//读出图片的像素数据 fread(imagedata, sizeof(unsigned char) * width, height, fpi);fclose(fpi);return imagedata;}}void saveImageData(const char* path, int width, int height, unsigned char* imagedata) {FILE *fpw;//保存bmp图片 if ((fpw = fopen(path, "wb")) == NULL){cout << "create the bmp file error!" << endl;return;}WORD bfType_w = 0x4d42;fwrite(&bfType_w, sizeof(WORD), 1, fpw);//保存文件头fwrite(&strHead, sizeof(tagBITMAPFILEHEADER), 1, fpw);strInfo.biWidth = width;strInfo.biHeight = height;fwrite(&strInfo, sizeof(tagBITMAPINFOHEADER), 1, fpw);//保存调色板数据 for (unsigned int nCounti = 0; nCounti<strInfo.biClrUsed; nCounti++){fwrite(&strPla[nCounti].rgbBlue, sizeof(BYTE), 1, fpw);fwrite(&strPla[nCounti].rgbGreen, sizeof(BYTE), 1, fpw);fwrite(&strPla[nCounti].rgbRed, sizeof(BYTE), 1, fpw);fwrite(&strPla[nCounti].rgbReserved, sizeof(BYTE), 1, fpw);}//保存像素数据 for (int i = 0; i < height; ++i){for (int j = 0; j < width; ++j){fwrite(&((*(imagedata + i * width + j))), 1, sizeof(unsigned char), fpw);}}fclose(fpw);}
CPU 实现均值滤波代码:
kernel.cu
// CPU reference implementation of the 3x3 mean filter.
//
// srcData  input image, height rows of width bytes, row-major
// dstData  output image of the same size
// width    number of bytes per row
// height   number of rows
//
// Border pixels (first/last row and column) are copied unchanged; every other
// pixel becomes the average of its 3x3 neighbourhood, truncated to an integer
// when stored. The elapsed time of the filter loop is measured with CUDA
// events and printed; if the events cannot be created the image is still
// filtered and nothing is printed.
extern "C" void cpuSmoothImage(unsigned char* srcData, unsigned char* dstData, int width, int height) {
    if (srcData == NULL || dstData == NULL || width <= 0 || height <= 0) {
        return;
    }

    const size_t rowStride = (size_t)width;

    // Copy the source image first so that the border keeps its original values.
    memcpy(dstData, srcData, rowStride * (size_t)height * sizeof(unsigned char));

    cudaEvent_t d_begin = NULL;
    cudaEvent_t d_end = NULL;
    const bool timing = cudaEventCreate(&d_begin) == cudaSuccess &&
                        cudaEventCreate(&d_end) == cudaSuccess;
    if (timing) {
        cudaEventRecord(d_begin, 0);
    }

    for (int i = 1; i < height - 1; i++) {
        // size_t offsets keep the row index arithmetic from overflowing int.
        const unsigned char* above = srcData + (size_t)(i - 1) * rowStride;
        const unsigned char* row = srcData + (size_t)i * rowStride;
        const unsigned char* below = srcData + (size_t)(i + 1) * rowStride;
        unsigned char* out = dstData + (size_t)i * rowStride;

        for (int j = 1; j < width - 1; j++) {
            float temp = 0;
            temp += row[j - 1];
            temp += row[j];
            temp += row[j + 1];
            temp += below[j - 1];
            temp += below[j];
            temp += below[j + 1];
            temp += above[j - 1];
            temp += above[j];
            temp += above[j + 1];
            temp = temp / 9;
            out[j] = (unsigned char)temp;
        }
    }

    if (timing) {
        cudaEventRecord(d_end, 0);
        cudaEventSynchronize(d_end);
        float cpuTime = 0.0f;
        cudaEventElapsedTime(&cpuTime, d_begin, d_end);
        printf(">>>CPU Time is : %f ms\n", cpuTime);
    }

    // Release the events; they were leaked before.
    if (d_begin != NULL) {
        cudaEventDestroy(d_begin);
    }
    if (d_end != NULL) {
        cudaEventDestroy(d_end);
    }
}
4、GPU全局内存实现
kernel.cu
// 3x3 mean filter, one thread per pixel, reading from global memory.
//
// Launch layout: x covers the columns, y covers the rows. The host wrapper
// below launches blocks of 256x1 threads with one grid row per image row; any
// 2D block shape works because both coordinates are derived from block and
// thread indices. No shared memory is used.
//
// width, height  image size in pixels (one byte per pixel, row-major)
// srcData        device pointer to the input image
// dstData        device pointer to the output image (must not alias srcData)
__global__ void kernelGPU(int width, int height, unsigned char* srcData, unsigned char* dstData) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;  // column
    const int j = threadIdx.y + blockDim.y * blockIdx.y;  // row

    // The grid is rounded up to whole blocks: threads past the image edge must
    // not touch memory, otherwise they write into the following row (or past
    // the end of the buffer on the last row).
    if (i >= width || j >= height) {
        return;
    }

    const size_t pos = (size_t)j * (size_t)width + (size_t)i;
    const unsigned char* centre = srcData + pos;

    // Border pixels are kept as they are.
    if (i > 0 && (i < width - 1) && j > 0 && (j < height - 1)) {
        float temp = 0;
        temp += centre[0];
        temp += centre[1];
        temp += centre[-1];
        temp += centre[-width - 1];
        temp += centre[-width];
        temp += centre[-width + 1];
        temp += centre[width - 1];
        temp += centre[width];
        temp += centre[width + 1];
        temp = temp / 9;
        dstData[pos] = (unsigned char)temp;
    } else {
        dstData[pos] = centre[0];
    }
}

// Host wrapper: uploads the image, runs kernelGPU and downloads the result.
//
// width, height  image size in pixels
// srcData        host pointer to the input image (width * height bytes)
// dstData        host pointer receiving the filtered image (same size)
//
// The printed time spans everything between the two events: device
// allocation, both copies and the kernel. On any CUDA error a message is
// written to stderr, dstData may be left unmodified, and all resources are
// still released.
extern "C" void gpuSmoothImage(int width, int height, unsigned char* srcData, unsigned char* dstData) {
    if (width <= 0 || height <= 0 || srcData == NULL || dstData == NULL) {
        fprintf(stderr, "gpuSmoothImage: invalid arguments\n");
        return;
    }

    const size_t size = (size_t)width * (size_t)height * sizeof(unsigned char);

    cudaEvent_t d_begin = NULL;
    cudaEvent_t d_end = NULL;
    unsigned char* d_srcData = NULL;
    unsigned char* d_dstData = NULL;
    float gpuTime = 0.0f;

    // Each step runs only if everything before it succeeded; the first error
    // is kept in err and reported once below.
    cudaError_t err = cudaEventCreate(&d_begin);
    if (err == cudaSuccess) err = cudaEventCreate(&d_end);
    if (err == cudaSuccess) err = cudaEventRecord(d_begin, 0);

    if (err == cudaSuccess) err = cudaMalloc((void**)&d_srcData, size);
    if (err == cudaSuccess) err = cudaMemcpy(d_srcData, srcData, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_dstData, size);

    if (err == cudaSuccess) {
        // Alternative layout: dim3 blockSize(16, 16);
        dim3 blockSize(256, 1);
        dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                      (height + blockSize.y - 1) / blockSize.y);
        kernelGPU<<<gridSize, blockSize>>>(width, height, d_srcData, d_dstData);
        // A launch does not return a status; fetch launch errors explicitly.
        err = cudaGetLastError();
    }

    // The blocking copy also surfaces errors raised while the kernel ran.
    if (err == cudaSuccess) err = cudaMemcpy(dstData, d_dstData, size, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) err = cudaEventRecord(d_end, 0);
    if (err == cudaSuccess) err = cudaEventSynchronize(d_end);
    if (err == cudaSuccess) err = cudaEventElapsedTime(&gpuTime, d_begin, d_end);

    if (err == cudaSuccess) {
        printf(">>>GPU Time is : %f ms\n", gpuTime);
    } else {
        fprintf(stderr, "gpuSmoothImage: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // cudaFree accepts NULL, so this is safe after a partial failure.
    cudaFree(d_srcData);
    cudaFree(d_dstData);
    if (d_begin != NULL) {
        cudaEventDestroy(d_begin);
    }
    if (d_end != NULL) {
        cudaEventDestroy(d_end);
    }
}
调用方式 main.cpp
#include <stdio.h>#include <stdlib.h>#include <string.h>#include <time.h>#include "readImage.h"extern "C" void gpuSmoothImage(int width, int height, unsigned char* srcData, unsigned char* dstData);extern "C" void cpuSmoothImage(unsigned char* srcData, unsigned char* dstData, int width, int height);extern "C" void gpuSmoothImageTexture(int width, int height, unsigned char* srcData, unsigned char* dstData);int main() {const char* path = "lena.bmp";int Width, Height;unsigned char* readData = readImageData(path, Width, Height);printf("w:%d h:%d\n", Width, Height);//showBmpHead();//showBmpInforHead();//CPU smooth Imageclock_t t1, t2;unsigned char* moothData = NULL;moothData = (unsigned char*)malloc(Width*Height * sizeof(unsigned char));cpuSmoothImage(readData, moothData, Width, Height);const char * savePath = "saveCPU.bmp";saveImageData(savePath, Width, Height, moothData);/*for (int i = 0; i < 100; i++) {for (int j = 0; j < 10; j++) {printf("%5d", moothData[i*Width + j]);}printf("\n");}*///GPU smooth Imageunsigned char* result = NULL;result = (unsigned char*)malloc(Width*Height * sizeof(unsigned char));const char * savePath1 = "saveGPU.bmp";gpuSmoothImage(Width, Height, readData, result);//gpuSmoothImageTexture(Width, Height, readData, result);saveImageData(savePath1, Width, Height, result);/*printf("GPUdata:\n");for (int i = 0; i < 100; i++) {for (int j = 0; j < 10; j++) {printf("%5d", result[i*Width + j]);}printf("\n");}*/return 0;}实验结果:
可以看到GPU和CPU的平滑结果完全一样,我们再看下并行加速的效果,图片在1024的时候加速了14.67倍,效果还是很好的。
5、纹理内存
因为我们是按照行存储的方式读取数据,所以使用一维纹理内存。对于纹理内存主要是三个步骤:纹理参考声明,纹理数据绑定,纹理拾取。
可以参考本人以前博客:http://blog.csdn.net/zhangfuliang123/article/details/76571498
代码如下:
// 1D texture reference through which the kernel reads the source image.
// NOTE: texture references are a deprecated CUDA API that was removed in
// CUDA 12; this listing needs an older toolkit (texture objects are the
// replacement).
texture<unsigned char, 1, cudaReadModeElementType> texRef;

// 3x3 mean filter, one thread per pixel, reading the source through texRef.
//
// Launch layout: x covers the columns, y covers the rows. The host wrapper
// below launches blocks of 256x1 threads with one grid row per image row; any
// 2D block shape works because both coordinates are derived from block and
// thread indices. texRef must be bound to the source image before the launch.
//
// width, height  image size in pixels (one byte per pixel, row-major)
// dstData        device pointer to the output image
__global__ void kernelGPUTexture(int width, int height, unsigned char* dstData) {
    const int i = threadIdx.x + blockDim.x * blockIdx.x;  // column
    const int j = threadIdx.y + blockDim.y * blockIdx.y;  // row

    // The grid is rounded up to whole blocks: threads past the image edge must
    // not write, otherwise they overwrite pixels of the following row (or
    // memory past the end of the buffer on the last row).
    if (i >= width || j >= height) {
        return;
    }

    const int pos = j * width + i;

    // Border pixels are kept as they are.
    if (i > 0 && (i < width - 1) && j > 0 && (j < height - 1)) {
        float temp = 0;
        temp += tex1Dfetch(texRef, pos);
        temp += tex1Dfetch(texRef, pos + 1);
        temp += tex1Dfetch(texRef, pos - 1);
        temp += tex1Dfetch(texRef, pos - width - 1);
        temp += tex1Dfetch(texRef, pos - width);
        temp += tex1Dfetch(texRef, pos - width + 1);
        temp += tex1Dfetch(texRef, pos + width - 1);
        temp += tex1Dfetch(texRef, pos + width);
        temp += tex1Dfetch(texRef, pos + width + 1);
        temp = temp / 9;
        dstData[pos] = (unsigned char)temp;
    } else {
        dstData[pos] = tex1Dfetch(texRef, pos);
    }
}

// Host wrapper: uploads the image, binds it to texRef, runs kernelGPUTexture
// and downloads the result.
//
// width, height  image size in pixels
// srcData        host pointer to the input image (width * height bytes)
// dstData        host pointer receiving the filtered image (same size)
//
// The printed time spans everything between the two events: device
// allocation, both copies, the texture binding and the kernel. On any CUDA
// error a message is written to stderr, dstData may be left unmodified, and
// all resources are still released.
extern "C" void gpuSmoothImageTexture(int width, int height, unsigned char* srcData, unsigned char* dstData) {
    if (width <= 0 || height <= 0 || srcData == NULL || dstData == NULL) {
        fprintf(stderr, "gpuSmoothImageTexture: invalid arguments\n");
        return;
    }

    const size_t size = (size_t)width * (size_t)height * sizeof(unsigned char);

    cudaEvent_t d_begin = NULL;
    cudaEvent_t d_end = NULL;
    unsigned char* d_srcData = NULL;
    unsigned char* d_dstData = NULL;
    bool textureBound = false;
    float gpuTime = 0.0f;

    // Each step runs only if everything before it succeeded; the first error
    // is kept in err and reported once below.
    cudaError_t err = cudaEventCreate(&d_begin);
    if (err == cudaSuccess) err = cudaEventCreate(&d_end);
    if (err == cudaSuccess) err = cudaEventRecord(d_begin, 0);

    if (err == cudaSuccess) err = cudaMalloc((void**)&d_srcData, size);
    if (err == cudaSuccess) err = cudaMemcpy(d_srcData, srcData, size, cudaMemcpyHostToDevice);
    if (err == cudaSuccess) err = cudaMalloc((void**)&d_dstData, size);

    // Bind the texture to exactly the bytes of the uploaded image.
    if (err == cudaSuccess) {
        err = cudaBindTexture(0, texRef, d_srcData, size);
        textureBound = (err == cudaSuccess);
    }

    if (err == cudaSuccess) {
        // Alternative layout: dim3 blockSize(16, 16);
        dim3 blockSize(256, 1);
        dim3 gridSize((width + blockSize.x - 1) / blockSize.x,
                      (height + blockSize.y - 1) / blockSize.y);
        kernelGPUTexture<<<gridSize, blockSize>>>(width, height, d_dstData);
        // A launch does not return a status; fetch launch errors explicitly.
        err = cudaGetLastError();
    }

    // The blocking copy also surfaces errors raised while the kernel ran.
    if (err == cudaSuccess) err = cudaMemcpy(dstData, d_dstData, size, cudaMemcpyDeviceToHost);

    if (err == cudaSuccess) err = cudaEventRecord(d_end, 0);
    if (err == cudaSuccess) err = cudaEventSynchronize(d_end);
    if (err == cudaSuccess) err = cudaEventElapsedTime(&gpuTime, d_begin, d_end);

    if (err == cudaSuccess) {
        printf(">>>GPU Time is : %f ms\n", gpuTime);
    } else {
        fprintf(stderr, "gpuSmoothImageTexture: CUDA error: %s\n", cudaGetErrorString(err));
    }

    // Unbind before the memory behind the texture is released.
    if (textureBound) {
        cudaUnbindTexture(&texRef);
    }
    // cudaFree accepts NULL, so this is safe after a partial failure.
    cudaFree(d_srcData);
    cudaFree(d_dstData);
    if (d_begin != NULL) {
        cudaEventDestroy(d_begin);
    }
    if (d_end != NULL) {
        cudaEventDestroy(d_end);
    }
}

// Article text following this listing: "使用纹理内存加速效果如下:" (speed-up obtained with texture memory)
可以看出,使用纹理内存,加速效果并不是很理想,应该是使用一维纹理内存拾取不能达到很好的效果(不是非常确定)。
阅读全文
0 0
- 图像均值滤波的CUDA并行化优化
- 图像的均值滤波、中值滤波_JAVA
- 图像的均值滤波和方框滤波
- 图像的均值滤波(mean filtering)
- 图像处理:均值滤波
- 图像噪声的抑制——均值滤波、中值滤波、对称均值滤波
- 图像噪声的抑制——均值滤波、中值滤波、对称均值滤波
- 图像噪声的抑制——均值滤波、中值滤波、对称均值滤波
- CUDA均值滤波和中值滤波编程
- 图像模糊--快速均值滤波
- 【图像处理】快速均值滤波
- MATLAB 图像 均值滤波 中值滤波
- 均值滤波、中值滤波【MATLAB】【图像处理】
- 发个MatLab 自编的 均值滤波、中值滤波、高斯滤波 图像处理函数
- 发个MatLab 自编的 均值滤波、中值滤波、高斯滤波 图像处理函数
- MatLab 自编的 均值滤波、中值滤波、高斯滤波 图像处理函数
- MatLab 自编的 均值滤波、中值滤波、高斯滤波 图像处理函数(转)
- 发个MatLab 自编的 均值滤波、中值滤波、高斯滤波 图像处理函数
- JavaScript 什么是原型链?
- list_for_each_entry
- TabLayout实现顶部标题栏和底部导航栏,TrustyGridSimpleAdapter实现按日分类图片
- 看完让你彻底搞懂Websocket原理
- [NOIP2014][建图]寻找道路
- 图像均值滤波的CUDA并行化优化
- Java算法-插入排序
- 无聊木鱼的动画效果
- HDU 6138 Fleet of the Eternal Throne
- MySQL触发器使用详解
- centos7 搭建FTP服务
- How to design DL model(3):Understanding LSTM Networks
- set的基本使用方法
- 字符识别(字符串处理)