CUDA卷积操作—使用constant memory实现高斯滤波
来源:互联网 发布:php decrypt 加密 编辑:程序博客网 时间:2024/05/16 23:44
先转载一篇CUDA卷积的实现,对比一下GPU与CPU中的代码有什么不同:
原文地址:CUDA卷积操作—使用constant memory实现高斯滤波
高斯滤波就是使用高斯模板和图片进行卷积运算,高斯函数及模板如下图所示:
卷积前后的效果图如下:
constant memory的使用及CUDA编程的相关内容,在代码注释中有详细介绍。
GPU代码如下所示:
#include "cuda_runtime.h" #include "device_launch_parameters.h" #include <stdio.h> #include "Windows.h" #include <math.h> #include<iostream> using namespace std; #define BLOCKDIM_X 16 #define BLOCKDIM_Y 16 #define GRIDDIM_X 256 #define GRIDDIM_Y 256 #define MASK_WIDTH 5 __constant__ int d_const_Gaussian[MASK_WIDTH*MASK_WIDTH]; //分配常数存储器 unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount); bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount); static __global__ void kernel_GaussianFilt(int width,int height,int byteCount,unsigned char *d_src_imgbuf,unsigned char *d_guassian_imgbuf); void main() { //查看显卡配置 struct cudaDeviceProp pror; cudaGetDeviceProperties(&pror,0); cout<<"maxThreadsPerBlock="<<pror.maxThreadsPerBlock<<endl; long start, end; long time = 0; //CUDA计时函数 start = GetTickCount(); cudaEvent_t startt,stop; //CUDA计时机制 cudaEventCreate(&startt); cudaEventCreate(&stop); cudaEventRecord(startt,0); unsigned char *h_src_imgbuf; //图像指针 int width, height, byteCount; char rootPath1[]="C:\\Users\\a404\\Desktop\\测试图片\\"; char readPath[1024]; int frame=1; for (int k=1;k<=frame;k++) { sprintf(readPath, "%s%d.bmp", rootPath1, k); h_src_imgbuf=readBmp(readPath, &width, &height, &byteCount); int size1=width*height *byteCount*sizeof(unsigned char); int size2=width*height *sizeof(unsigned char); //输出图像内存-host端 unsigned char *h_guassian_imgbuf=new unsigned char[width*height*byteCount]; //分配显存空间 unsigned char *d_src_imgbuf; unsigned char *d_guassian_imgbuf; cudaMalloc((void**)&d_src_imgbuf, size1); cudaMalloc((void**)&d_guassian_imgbuf, size1); //把数据从Host传到Device cudaMemcpy(d_src_imgbuf, h_src_imgbuf, size1, cudaMemcpyHostToDevice); //将高斯模板传入constant memory int Gaussian[25] = {1,4,7,4,1, 4,16,26,16,4, 7,26,41,26,7, 4,16,26,16,4, 1,4,7,4,1};//总和为273 cudaMemcpyToSymbol(d_const_Gaussian, Gaussian, 25 * sizeof(int)); int bx = ceil((double)width/BLOCKDIM_X); //网格和块的分配 int by = ceil((double)height/BLOCKDIM_Y); if(bx > GRIDDIM_X) bx = GRIDDIM_X; if(by > GRIDDIM_Y) by = GRIDDIM_Y; dim3 grid(bx, by);//网格的结构 dim3 block(BLOCKDIM_X, BLOCKDIM_Y);//块的结构 //kernel--高斯滤波 kernel_GaussianFilt<<<grid, block>>>(width,height,byteCount,d_src_imgbuf,d_guassian_imgbuf); cudaMemcpy(h_guassian_imgbuf, d_guassian_imgbuf,size1, cudaMemcpyDeviceToHost);//数据传回主机端 char rootPath2[]="C:\\Users\\a404\\Desktop\\测试结果\\"; char writePath[1024]; sprintf(writePath, "%s%d.bmp", rootPath2, k); saveBmp(writePath, h_guassian_imgbuf, width, height, byteCount); //输出进度展示 cout<<k<<" "<<((float)k/frame)*100<<"%"<<endl; //释放内存 cudaFree(d_src_imgbuf); cudaFree(d_guassian_imgbuf); delete []h_src_imgbuf; delete []h_guassian_imgbuf; } end = GetTickCount(); InterlockedExchangeAdd(&time, end - start); cout << "Total time GPU:"; cout << time << endl; int x; cin>>x; } static __global__ void kernel_GaussianFilt(int width,int height,int byteCount,unsigned char *d_src_imgbuf,unsigned char *d_dst_imgbuf) { const int tix = blockDim.x * blockIdx.x + threadIdx.x; const int tiy = blockDim.y * blockIdx.y + threadIdx.y; const int threadTotalX = blockDim.x * gridDim.x; const int threadTotalY = blockDim.y * gridDim.y; for(int ix = tix; ix < height; ix += threadTotalX) for(int iy = tiy; iy < width; iy += threadTotalY) { for(int k=0;k<byteCount;k++) { int sum=0;//临时值 int tempPixelValue=0; for (int m=-2;m<=2;m++) { for (int n=-2;n<=2;n++) { //边界处理,幽灵元素赋值为零 if (ix+m<0||iy+n<0||ix+m>=height||iy+n>=width) tempPixelValue=0; else tempPixelValue=*(d_src_imgbuf+(ix+m)*width*byteCount+(iy+n)*byteCount+k); sum+=tempPixelValue*d_const_Gaussian[(m+2)*5+n+2]; } } if (sum/273<0) *(d_dst_imgbuf+(ix)*width*byteCount+(iy)*byteCount+k)=0; else if(sum/273>255) *(d_dst_imgbuf+(ix)*width*byteCount+(iy)*byteCount+k)=255; else *(d_dst_imgbuf+(ix)*width*byteCount+(iy)*byteCount+k)=sum/273; } } } unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount) { //打开文件 FILE *fp=fopen(bmpName,"rb"); if(fp==0) return 0; //跳过文件头 fseek(fp, sizeof(BITMAPFILEHEADER),0); //读入信息头 int w, h, b; BITMAPINFOHEADER head; fread(&head, sizeof(BITMAPINFOHEADER), 1,fp); w = head.biWidth; h = head.biHeight; b = head.biBitCount/8; int lineByte=(w * b+3)/4*4; //每行的字节数为4的倍数 //跳过颜色表 (颜色表的大小为1024)(彩色图像并没有颜色表,不需要这一步) if(b==1) fseek(fp, 1024,1); //图像数据 unsigned char *imgBuf=new unsigned char[w * h * b]; for(int i=0;i<h;i++) { fread(imgBuf+i*w*b,w*b, 1,fp); fseek(fp, lineByte-w*b, 1); } fclose(fp); *width=w, *height=h, *byteCount=b; return imgBuf; } bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount) { if(!imgBuf) return 0; //灰度图像颜色表空间1024,彩色图像没有颜色表 int palettesize=0; if(byteCount==1) palettesize=1024; //一行象素字节数为4的倍数 int lineByte=(width * byteCount+3)/4*4; FILE *fp=fopen(bmpName,"wb"); if(fp==0) return 0; //填写文件头 BITMAPFILEHEADER fileHead; fileHead.bfType = 0x4D42; fileHead.bfSize= sizeof(BITMAPFILEHEADER)+sizeof(BITMAPINFOHEADER)+ palettesize + lineByte*height; fileHead.bfReserved1 = 0; fileHead.bfReserved2 = 0; fileHead.bfOffBits=54+palettesize; fwrite(&fileHead, sizeof(BITMAPFILEHEADER),1, fp); // 填写信息头 BITMAPINFOHEADER head; head.biBitCount=byteCount*8; head.biClrImportant=0; head.biClrUsed=0; head.biCompression=0; head.biHeight=height; head.biPlanes=1; head.biSize=40; head.biSizeImage=lineByte*height; head.biWidth=width; head.biXPelsPerMeter=0; head.biYPelsPerMeter=0; fwrite(&head, sizeof(BITMAPINFOHEADER),1, fp); //颜色表拷贝 if(palettesize==1024) { unsigned char palette[1024]; for(int i=0;i<256;i++) { *(palette+i*4+0)=i; *(palette+i*4+1)=i; *(palette+i*4+2)=i; *(palette+i*4+3)=0; } fwrite(palette, 1024,1, fp); } //准备数据并写文件 unsigned char *buf=new unsigned char[height*lineByte]; for(int i=0;i<height;i++) { for(int j=0;j<width*byteCount; j++) *(buf+i*lineByte+j)=*(imgBuf+i*width*byteCount+j); } fwrite(buf, height*lineByte, 1, fp); delete []buf; fclose(fp); return 1; }
附录:高斯滤波CPU代码
#include "stdio.h" #include "Windows.h" #include <iostream> using namespace std; unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount); //读入图像 bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount); //保存图像 void GaussianFilt(int width,int height,int byteCount,int Gaussian[][5],unsigned char *gray_imgbuf,unsigned char *guassian_imgbuf); //高斯滤波 void main() { //计时函数 long start, end; long time = 0; start = GetTickCount(); unsigned char *src_imgbuf; //图像指针 int width, height, byteCount; char rootPath1[]="C:\\Users\\a404\\Desktop\\测试图片\\"; char readPath[1024]; int frame=300; //读入图像副数 for (int i=1;i<=frame;i++) { sprintf(readPath, "%s%d.bmp", rootPath1, i); src_imgbuf=readBmp(readPath, &width, &height, &byteCount); //printf("宽=%d,高=%d,字节=%d\n",width, height, byteCount); //读入高斯模糊模板 int Gaussian_mask[5][5]={{1,4,7,4,1},{4,16,26,16,4},{7,26,41,26,7},{4,16,26,16,4},{1,4,7,4,1}};//总和为273 //输出图像内存分配 unsigned char *guassian_imgbuf=new unsigned char[width*height*byteCount]; //对原图高斯模糊 GaussianFilt(width,height,byteCount,Gaussian_mask,src_imgbuf,guassian_imgbuf); char rootPath2[]="C:\\Users\\a404\\Desktop\\"; char writePath[1024]; sprintf(writePath, "%s%d.bmp", rootPath2, i); saveBmp(writePath, guassian_imgbuf, width, height, byteCount); cout<<i<<" "<<((float)i/frame)*100<<"%"<<endl; delete []src_imgbuf; delete []guassian_imgbuf; } end = GetTickCount(); InterlockedExchangeAdd(&time, end - start); cout << "Total time CPU:"; cout << time << endl; int x; cin>>x; } void GaussianFilt(int width,int height,int byteCount,int Gaussian[][5],unsigned char *src_imgbuf,unsigned char *guassian_imgbuf) { //高斯模糊处理 5层循环处理 for(int i=0;i<height;i++) { for(int j=0;j<width;j++) { for(int k=0;k<byteCount;k++) { int sum=0;//临时值 int tempPixelValue=0; for (int m=-2;m<=2;m++) { for (int n=-2;n<=2;n++) { //边界处理,幽灵元素赋值为零 if (i+m<0||j+n<0||i+m>=height||j+n>=width) tempPixelValue=0; else tempPixelValue=*(src_imgbuf+(i+m)*width*byteCount+(j+n)*byteCount+k); //tempPixelValue=*(gray_imgbuf+(i+m)*width+(j+n)+k); sum+=tempPixelValue*Gaussian[m+2][n+2]; } } //tempPixelValue=*(src_imgbuf+(i)*width*byteCount+(j)*byteCount+k); if (sum/273<0) *(guassian_imgbuf+i*width*byteCount+j*byteCount+k)=0; else if(sum/273>255) *(guassian_imgbuf+i*width*byteCount+j*byteCount+k)=255; else *(guassian_imgbuf+i*width*byteCount+j*byteCount+k)=sum/273; } } } } //给定一个图像文件及其路径,读入图像数据。 unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount) { //打开文件, FILE *fp=fopen(bmpName,"rb"); if(fp==0) return 0; //跳过文件头 fseek(fp, sizeof(BITMAPFILEHEADER),0); //读入信息头 int w, h, b; BITMAPINFOHEADER head; fread(&head, sizeof(BITMAPINFOHEADER), 1,fp); w = head.biWidth; h = head.biHeight; b = head.biBitCount/8; int lineByte=(w * b+3)/4*4; //每行的字节数为4的倍数 //跳过颜色表 (颜色表的大小为1024)(彩色图像并没有颜色表,不需要这一步) if(b==1) fseek(fp, 1024,1); //图像数据 unsigned char *imgBuf=new unsigned char[w * h * b]; for(int i=0;i<h;i++) { fread(imgBuf+i*w*b,w*b, 1,fp); fseek(fp, lineByte-w*b, 1); } fclose(fp); *width=w, *height=h, *byteCount=b; return imgBuf; } bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount) { if(!imgBuf) return 0; //灰度图像颜色表空间1024,彩色图像没有颜色表 int palettesize=0; if(byteCount==1) palettesize=1024; //一行象素字节数为4的倍数 int lineByte=(width * byteCount+3)/4*4; FILE *fp=fopen(bmpName,"wb"); if(fp==0) return 0; //填写文件头 BITMAPFILEHEADER fileHead; fileHead.bfType = 0x4D42; fileHead.bfSize= sizeof(BITMAPFILEHEADER)+sizeof(BITMAPINFOHEADER)+ palettesize + lineByte*height; fileHead.bfReserved1 = 0; fileHead.bfReserved2 = 0; fileHead.bfOffBits=54+palettesize; fwrite(&fileHead, sizeof(BITMAPFILEHEADER),1, fp); // 填写信息头 BITMAPINFOHEADER head; head.biBitCount=byteCount*8; head.biClrImportant=0; head.biClrUsed=0; head.biCompression=0; head.biHeight=height; head.biPlanes=1; head.biSize=40; head.biSizeImage=lineByte*height; head.biWidth=width; head.biXPelsPerMeter=0; head.biYPelsPerMeter=0; fwrite(&head, sizeof(BITMAPINFOHEADER),1, fp); //颜色表拷贝 if(palettesize==1024) { unsigned char palette[1024]; for(int i=0;i<256;i++) { *(palette+i*4+0)=i; *(palette+i*4+1)=i; *(palette+i*4+2)=i; *(palette+i*4+3)=0; } fwrite(palette, 1024,1, fp); } //准备数据并写文件 unsigned char *buf=new unsigned char[height*lineByte]; for(int i=0;i<height;i++) { for(int j=0;j<width*byteCount; j++) *(buf+i*lineByte+j)=*(imgBuf+i*width*byteCount+j); } fwrite(buf, height*lineByte, 1, fp); delete []buf; fclose(fp); return 1; }
阅读全文
0 0
- CUDA卷积操作—使用constant memory实现高斯滤波
- CUDA卷积操作—使用constant memory实现高斯滤波
- Constant Memory in CUDA
- 高斯滤波及高斯卷积核C++实现
- 高斯滤波及高斯卷积核C++实现
- 高斯滤波及高斯卷积核C++实现
- 高斯滤波及高斯卷积核C++实现
- 基础——线性滤波与卷积、高斯滤波
- CUDA实现图像的高斯滤波(opencv实现)
- c#实现图像图像卷积与滤波——高斯平滑
- Matlab中的高斯卷积滤波矩阵
- 影像卷积和滤波运算(高斯滤波模板)
- 影像卷积和滤波运算(高斯滤波模板)
- 影像卷积和滤波运算(高斯滤波模板)
- 图像卷积和滤波运算(高斯滤波模板)
- 影像卷积和滤波运算(高斯滤波模板)
- 图像卷积和滤波处理-高斯滤波
- 图像卷积和滤波运算(高斯滤波模板)
- xpath和css选择元素的不同
- Notification 都不知道,还学什么 Android 应用开发
- 我的第一次博客
- TensorFlow 安装官方教程:Ubuntu 安装,Mac OS X 安装,Windows 安装
- 在C++类中使用pthread实现多线程
- CUDA卷积操作—使用constant memory实现高斯滤波
- HDU2066 最短路spfa 多起点 多终点 基础题
- kruskal算法应用——口袋的天空
- MYSQL学习笔记(十九)使用存储过程
- java递归算法(1加到100&换汽水)
- 正则表达式的使用
- 控制反转(IoC)与依赖注入(DI)
- 非常可乐
- OpenCV之亮度、对比度详解