CUDA大津阈值—原子加统计直方图&规约求最大值索引
来源:互联网 发布:地牢猎手5淘宝刷钻 编辑:程序博客网 时间:2024/06/05 15:17
在大津阈值的CUDA实现中有三个需要注意的地方:
(1)使用atomicAdd操作实现直方图的统计;
atomicAdd简介:原子操作一般要实现读取-修改-写入三个步骤,并且在执行过程中不会被打断。即除非已经完成了这三个操作,否则其他线程都不能读取或写入x的值。
在计算直方图时,存在一个问题,多个线程可能对输出直方图的同一元素进行递增,这时候我们需要使用原子操作完成。
(2)借助sharedmemory规约求和
(3)使用二维shared memory规约求最大值,下图为二维shared memory的示意图。
下图为大津阈值的前后对比图:
详细介绍见附录代码及注释。
GPU代码如下:
#include "cuda_runtime.h"#include "device_launch_parameters.h"#include <stdio.h>#include "Windows.h"#include <math.h>#include<iostream>using namespace std;#define BLOCKDIM_X16#define BLOCKDIM_Y16#define GRIDDIM_X256#define GRIDDIM_Y256#define MASK_WIDTH5unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount);bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount);static __global__ void kernel_color2gray(int width,int height,int byteCount,unsigned char *d_src_imgbuf,unsigned char *d_gray_imgbuf);static __global__ void kernel_histogram(int width,int height,unsigned char* d_gray_imgbuf,int* histogram);static __global__ void kernel_ReductionSum(int *d_a, int* d_sum);static __global__ void kernel_Variance(int *histogram, float* devi, int sum);static __global__ void kernel_ReductionMax(float* devi, int* thresh);void main(){//查看显卡配置struct cudaDeviceProp pror;cudaGetDeviceProperties(&pror,0);cout<<"maxThreadsPerBlock="<<pror.maxThreadsPerBlock<<endl;long start, end;long time = 0; //CUDA计时函数start = GetTickCount();cudaEvent_t startt,stop; //CUDA计时机制cudaEventCreate(&startt);cudaEventCreate(&stop);cudaEventRecord(startt,0);unsigned char *h_src_imgbuf; //图像指针int width, height, byteCount;char rootPath1[]="C:\\Users\\a404\\Desktop\\测试图片\\";char readPath[1024];int frame=300;for (int k=1;k<=frame;k++){sprintf(readPath, "%s%d.bmp", rootPath1, k);h_src_imgbuf=readBmp(readPath, &width, &height, &byteCount);//printf("宽=%d,高=%d,字节=%d\n",width, height, byteCount);int size1=width*height *byteCount*sizeof(unsigned char);int size2=width*height *sizeof(unsigned char);//输出图像内存-host端unsigned char *h_guassian_imgbuf=new unsigned char[width*height*byteCount];unsigned char *h_gray_imgbuf=new unsigned char[width*height];unsigned char *h_Otus_imgbuf=new unsigned char[width*height];//分配显存空间unsigned char *d_src_imgbuf;unsigned char *d_gray_imgbuf;unsigned char *d_Otus_imgbuf;int *d_histogram;float *d_devi;int *d_sum;int *d_thresh;float *d_maxdevi;cudaMalloc((void**)&d_src_imgbuf, size1);cudaMalloc((void**)&d_gray_imgbuf, size2);cudaMalloc((void**)&d_Otus_imgbuf, size2);cudaMalloc((void**)&d_histogram, 256*sizeof(int));cudaMalloc((void**)&d_devi, 256*sizeof(float));cudaMalloc((void**)&d_sum, sizeof(int));cudaMalloc((void**)&d_thresh, sizeof(int));//cudaMalloc((void**)&d_maxdevi, sizeof(float));//把数据从Host传到DevicecudaMemcpy(d_src_imgbuf, h_src_imgbuf, size1, cudaMemcpyHostToDevice);int bx = ceil((double)width/BLOCKDIM_X); //网格和块的分配int by = ceil((double)height/BLOCKDIM_Y);if(bx > GRIDDIM_X) bx = GRIDDIM_X;if(by > GRIDDIM_Y) by = GRIDDIM_Y;dim3 grid(bx, by);//网格的结构dim3 block(BLOCKDIM_X, BLOCKDIM_Y);//块的结构//kernel--灰度变换kernel_color2gray<<<grid, block>>>(width,height,byteCount,d_src_imgbuf,d_gray_imgbuf);cudaMemcpy(h_gray_imgbuf, d_gray_imgbuf,size2, cudaMemcpyDeviceToHost);//数据传回主机端//统计直方图int h_histogram[256]={0};//使用原子操作进行直方图统计cudaMemcpy(d_histogram, h_histogram, 256*sizeof(int), cudaMemcpyHostToDevice);kernel_histogram<<<grid, block>>>(width,height,d_gray_imgbuf,d_histogram);cudaMemcpy(h_histogram, d_histogram, 256*sizeof(int), cudaMemcpyDeviceToHost); //方差及最大方差float h_devi[256]={0}, maxDevi=0; //像素总数int h_sum[1]={0};//规约求和--借助shared memorykernel_ReductionSum<<<1, 256>>>(d_histogram,d_sum);cudaMemcpy(h_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost);//遍历求方差--启用256个线程cudaMemcpy(d_devi, h_devi, size1, cudaMemcpyHostToDevice);kernel_Variance<<<1, 256>>>(d_histogram,d_devi,h_sum[0]);cudaMemcpy(h_devi, d_devi, 256*sizeof(float), cudaMemcpyDeviceToHost);//最佳阈值threshint h_thresh[1]={0};//规约求最大值,并记录最大值位置--启用256*2个线程kernel_ReductionMax<<<1, 512>>>(d_devi,d_thresh);cudaMemcpy(h_thresh, d_thresh, sizeof(int), cudaMemcpyDeviceToHost);for(int i=0;i<height;i++)for(int j=0;j<width;j++){int value=*(h_gray_imgbuf+i*width+j);if(value<=h_thresh[0])*(h_Otus_imgbuf+i*width+j)=0;else*(h_Otus_imgbuf+i*width+j)=255;}char rootPath2[]="C:\\Users\\a404\\Desktop\\测试结果\\";char writePath[1024];sprintf(writePath, "%s%d.bmp", rootPath2, k);//saveBmp(writePath, h_guassian_imgbuf, width, height, byteCount);saveBmp(writePath, h_Otus_imgbuf, width, height, 1);cout<<k<<" "<<((float)k/frame)*100<<"%"<<endl;//释放内存cudaFree(d_src_imgbuf); cudaFree(d_gray_imgbuf);cudaFree(d_Otus_imgbuf);cudaFree(d_histogram);cudaFree(d_sum);cudaFree(d_thresh);delete []h_src_imgbuf;delete []h_gray_imgbuf;delete []h_Otus_imgbuf;}end = GetTickCount();InterlockedExchangeAdd(&time, end - start);cout << "Total time GPU:";cout << time << endl;int x;cin>>x;}static __global__ void kernel_color2gray(int width,int height,int byteCount,unsigned char *d_src_imgbuf,unsigned char *d_gray_imgbuf){const int tix = blockDim.x * blockIdx.x + threadIdx.x;const int tiy = blockDim.y * blockIdx.y + threadIdx.y;const int threadTotalX = blockDim.x * gridDim.x;const int threadTotalY = blockDim.y * gridDim.y;for(int ix = tix; ix < height; ix += threadTotalX)for(int iy = tiy; iy < width; iy += threadTotalY)*(d_gray_imgbuf+ix*width+iy)=0.11**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+0)+0.59**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+1)+0.30**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+2)+0.5;}static __global__ void kernel_histogram(int width,int height,unsigned char* d_gray_imgbuf,int* histogram){__shared__ int temp[256];//直方图初始化为0temp[threadIdx.x*16+threadIdx.y] = 0;__syncthreads();const int tix = blockDim.x * blockIdx.x + threadIdx.x;const int tiy = blockDim.y * blockIdx.y + threadIdx.y;const int threadTotalX = blockDim.x * gridDim.x;const int threadTotalY = blockDim.y * gridDim.y;for(int ix = tix; ix < height; ix += threadTotalX){for(int iy = tiy; iy < width; iy += threadTotalY){atomicAdd(&(temp[d_gray_imgbuf[ix*width+iy]]), 1);}}__syncthreads();atomicAdd(&(histogram[threadIdx.x*16+threadIdx.y]), temp[threadIdx.x*16+threadIdx.y]);}static __global__ void kernel_ReductionSum(int *d_a, int* d_sum) { //申请共享内存,存在于每个block中 __shared__ int partialSum[256]; //确定索引 int tid = threadIdx.x; //传global memory数据到shared memory partialSum[tid]=d_a[tid]; //传输同步 __syncthreads(); //在共享存储器中进行规约 for(int stride = 128; stride > 0; stride/=2) { if(tid<stride) partialSum[tid]+=partialSum[tid+stride]; __syncthreads(); } //将当前block的计算结果写回输出数组 if(tid==0) d_sum[0] = partialSum[0]; } static __global__ void kernel_Variance(int *histogram, float* devi, int sum){int tid = threadIdx.x; //c0和c1组的均值float u0,u1; //c0和c1组产生的概率float w0,w1; //c0组的像素数int count0; //阈值t//int t; u0=0;count0=0;for(int i=0; i<=tid;i++){u0 += i*histogram[i];count0 += histogram[i];}u0=u0/count0;w0=(float)count0/sum;//计算阈值为t时,c1组的均值和产生的概率u1=0;for(int i=tid+1; i<256;i++)u1+=i*histogram[i];//C0组像素数与C1组像素数之和为图像总像素数。u1=u1/(sum-count0);w1=1-w0;//两组间的方差devi[tid]=w0*w1*(u1-u0)*(u1-u0);}static __global__ void kernel_ReductionMax(float* devi,int* thresh){//申请共享内存,存在于每个block中 __shared__ float Max[256*2];//__shared__ int index[256]; //确定索引 int tid = threadIdx.x; // partialindex[tid]=0; //传global memory数据到shared memory if(tid<256)Max[tid]=devi[tid];elseMax[tid]=tid-256;//初始化索引//index[256]=tid; //传输同步 __syncthreads(); //在共享存储器中进行规约,记录索引变化 for(int stride = 128; stride > 0; stride/=2) { if(tid<stride) {if(Max[tid]<Max[tid+stride]){Max[tid]=Max[tid+stride];Max[tid+256]=Max[tid+stride+256];//index[tid]=index[tid+stride];}} __syncthreads(); } //将当前block的计算结果写回输出数组 if(tid==256)thresh[0]=Max[256]; }unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount){//打开文件FILE *fp=fopen(bmpName,"rb");if(fp==0) return 0;//跳过文件头fseek(fp, sizeof(BITMAPFILEHEADER),0);//读入信息头int w, h, b;BITMAPINFOHEADER head;fread(&head, sizeof(BITMAPINFOHEADER), 1,fp); w = head.biWidth;h = head.biHeight;b = head.biBitCount/8;int lineByte=(w * b+3)/4*4; //每行的字节数为4的倍数//跳过颜色表 (颜色表的大小为1024)(彩色图像并没有颜色表,不需要这一步)if(b==1)fseek(fp, 1024,1);//图像数据unsigned char *imgBuf=new unsigned char[w * h * b];for(int i=0;i<h;i++){fread(imgBuf+i*w*b,w*b, 1,fp);fseek(fp, lineByte-w*b, 1);}fclose(fp);*width=w, *height=h, *byteCount=b;return imgBuf;}bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount){if(!imgBuf)return 0;//灰度图像颜色表空间1024,彩色图像没有颜色表int palettesize=0;if(byteCount==1) palettesize=1024;//一行象素字节数为4的倍数int lineByte=(width * byteCount+3)/4*4;FILE *fp=fopen(bmpName,"wb");if(fp==0) return 0;//填写文件头BITMAPFILEHEADER fileHead;fileHead.bfType = 0x4D42;fileHead.bfSize= sizeof(BITMAPFILEHEADER)+sizeof(BITMAPINFOHEADER)+ palettesize + lineByte*height;fileHead.bfReserved1 = 0;fileHead.bfReserved2 = 0;fileHead.bfOffBits=54+palettesize;fwrite(&fileHead, sizeof(BITMAPFILEHEADER),1, fp);// 填写信息头BITMAPINFOHEADER head; head.biBitCount=byteCount*8;head.biClrImportant=0;head.biClrUsed=0;head.biCompression=0;head.biHeight=height;head.biPlanes=1;head.biSize=40;head.biSizeImage=lineByte*height;head.biWidth=width;head.biXPelsPerMeter=0;head.biYPelsPerMeter=0;fwrite(&head, sizeof(BITMAPINFOHEADER),1, fp);//颜色表拷贝 if(palettesize==1024){unsigned char palette[1024];for(int i=0;i<256;i++){*(palette+i*4+0)=i;*(palette+i*4+1)=i;*(palette+i*4+2)=i;*(palette+i*4+3)=0; }fwrite(palette, 1024,1, fp);}//准备数据并写文件unsigned char *buf=new unsigned char[height*lineByte];for(int i=0;i<height;i++){for(int j=0;j<width*byteCount; j++)*(buf+i*lineByte+j)=*(imgBuf+i*width*byteCount+j);}fwrite(buf, height*lineByte, 1, fp);delete []buf;fclose(fp);return 1;}
0 0
- CUDA大津阈值—原子加统计直方图&规约求最大值索引
- CUDA中的直方图统计
- cuda直方图计算——利用shared memory统计直方图
- C#——求最大值的索引
- 基于CUDA的图像亮度直方图统计
- 求最大值、索引、排序总结
- 我的CUDA学习之旅3——图像灰度化、灰度直方图统计
- 求最大值最小值以及第K大值问题(顺序统计量问题)
- 索引——直方图
- 索引——直方图
- CUDA Thrust 规约求和
- 直方图阈值法
- awk统计命令(求和、求平均、求最大值、求最小值)
- 求数组中的最大值和最大值的索引
- 大表数据加索引,加字段
- 自适应阈值算法(大津阈值法)
- 转载贴--自适应阈值:大津阈值法完全实现
- 求数组的最大值、第二大值
- 几个PHP数组处理函数
- SpringMVC定时任务注解实现@Schedule
- SolrCloud初识
- 动态规划简要笔记
- CentOS7安装MariaDB10.X
- CUDA大津阈值—原子加统计直方图&规约求最大值索引
- TabIndex
- java基础入门-poi解析excel
- 利用桥接模式如何在局域网里远程连接CentOS(VMWare虚拟机)
- 链表的全部操作——创建、插入、查找、删除、计算长度
- Hadoop配置Yarn
- Java反射知识
- Hadoop配置PATH环境变量
- Java琐碎小知识(二)