CUDA大津阈值—原子加统计直方图&规约求最大值索引

来源：互联网发布：地牢猎手5淘宝刷钻编辑：程序博客网时间：2024/06/05 15:17
在大津阈值的CUDA实现中有三个需要注意的地方：
（1）使用atomicAdd操作实现直方图的统计；
atomicAdd简介：原子操作一般要实现读取-修改-写入三个步骤，并且在执行过程中不会被打断。即除非已经完成了这三个操作，否则其他线程都不能读取或写入x的值。
在计算直方图时，存在一个问题，多个线程可能对输出直方图的同一元素进行递增，这时候我们需要使用原子操作完成。
（2）借助sharedmemory规约求和
（3）使用二维shared memory规约求最大值，下图为二维shared memory的示意图。
下图为大津阈值的前后对比图：
详细介绍见附录代码及注释。
GPU代码如下：
#include "cuda_runtime.h"#include "device_launch_parameters.h"#include <stdio.h>#include "Windows.h"#include <math.h>#include<iostream>using namespace std;#define BLOCKDIM_X16#define BLOCKDIM_Y16#define GRIDDIM_X256#define GRIDDIM_Y256#define MASK_WIDTH5unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount);bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount);static __global__ void kernel_color2gray(int width,int height,int byteCount,unsigned char *d_src_imgbuf,unsigned char *d_gray_imgbuf);static __global__ void kernel_histogram(int width,int height,unsigned char* d_gray_imgbuf,int* histogram);static __global__ void kernel_ReductionSum(int *d_a, int* d_sum);static __global__ void kernel_Variance(int *histogram, float* devi, int sum);static __global__ void kernel_ReductionMax(float* devi, int* thresh);void main(){//查看显卡配置struct cudaDeviceProp pror;cudaGetDeviceProperties(&pror,0);cout<<"maxThreadsPerBlock="<<pror.maxThreadsPerBlock<<endl;long start, end;long time = 0; //CUDA计时函数start = GetTickCount();cudaEvent_t startt,stop; //CUDA计时机制cudaEventCreate(&startt);cudaEventCreate(&stop);cudaEventRecord(startt,0);unsigned char *h_src_imgbuf;  //图像指针int width, height, byteCount;char rootPath1[]="C:\\Users\\a404\\Desktop\\测试图片\\";char readPath[1024];int frame=300;for (int k=1;k<=frame;k++){sprintf(readPath, "%s%d.bmp", rootPath1, k);h_src_imgbuf=readBmp(readPath, &width, &height, &byteCount);//printf("宽=%d，高=%d，字节=%d\n",width, height, byteCount);int size1=width*height *byteCount*sizeof(unsigned char);int size2=width*height *sizeof(unsigned char);//输出图像内存-host端unsigned char *h_guassian_imgbuf=new unsigned char[width*height*byteCount];unsigned char *h_gray_imgbuf=new unsigned char[width*height];unsigned char *h_Otus_imgbuf=new unsigned char[width*height];//分配显存空间unsigned char *d_src_imgbuf;unsigned char *d_gray_imgbuf;unsigned char *d_Otus_imgbuf;int *d_histogram;float *d_devi;int *d_sum;int *d_thresh;float *d_maxdevi;cudaMalloc((void**)&d_src_imgbuf, size1);cudaMalloc((void**)&d_gray_imgbuf, size2);cudaMalloc((void**)&d_Otus_imgbuf, size2);cudaMalloc((void**)&d_histogram, 256*sizeof(int));cudaMalloc((void**)&d_devi, 256*sizeof(float));cudaMalloc((void**)&d_sum, sizeof(int));cudaMalloc((void**)&d_thresh, sizeof(int));//cudaMalloc((void**)&d_maxdevi, sizeof(float));//把数据从Host传到DevicecudaMemcpy(d_src_imgbuf, h_src_imgbuf, size1, cudaMemcpyHostToDevice);int bx = ceil((double)width/BLOCKDIM_X); //网格和块的分配int by = ceil((double)height/BLOCKDIM_Y);if(bx > GRIDDIM_X) bx = GRIDDIM_X;if(by > GRIDDIM_Y) by = GRIDDIM_Y;dim3 grid(bx, by);//网格的结构dim3 block(BLOCKDIM_X, BLOCKDIM_Y);//块的结构//kernel--灰度变换kernel_color2gray<<<grid, block>>>(width,height,byteCount,d_src_imgbuf,d_gray_imgbuf);cudaMemcpy(h_gray_imgbuf, d_gray_imgbuf,size2, cudaMemcpyDeviceToHost);//数据传回主机端//统计直方图int h_histogram[256]={0};//使用原子操作进行直方图统计cudaMemcpy(d_histogram, h_histogram, 256*sizeof(int), cudaMemcpyHostToDevice);kernel_histogram<<<grid, block>>>(width,height,d_gray_imgbuf,d_histogram);cudaMemcpy(h_histogram, d_histogram, 256*sizeof(int), cudaMemcpyDeviceToHost);  //方差及最大方差float h_devi[256]={0}, maxDevi=0;  //像素总数int h_sum[1]={0};//规约求和--借助shared memorykernel_ReductionSum<<<1, 256>>>(d_histogram,d_sum);cudaMemcpy(h_sum, d_sum, sizeof(int), cudaMemcpyDeviceToHost);//遍历求方差--启用256个线程cudaMemcpy(d_devi, h_devi, size1, cudaMemcpyHostToDevice);kernel_Variance<<<1, 256>>>(d_histogram,d_devi,h_sum[0]);cudaMemcpy(h_devi, d_devi, 256*sizeof(float), cudaMemcpyDeviceToHost);//最佳阈值threshint h_thresh[1]={0};//规约求最大值，并记录最大值位置--启用256*2个线程kernel_ReductionMax<<<1, 512>>>(d_devi,d_thresh);cudaMemcpy(h_thresh, d_thresh, sizeof(int), cudaMemcpyDeviceToHost);for(int i=0;i<height;i++)for(int j=0;j<width;j++){int value=*(h_gray_imgbuf+i*width+j);if(value<=h_thresh[0])*(h_Otus_imgbuf+i*width+j)=0;else*(h_Otus_imgbuf+i*width+j)=255;}char rootPath2[]="C:\\Users\\a404\\Desktop\\测试结果\\";char writePath[1024];sprintf(writePath, "%s%d.bmp", rootPath2, k);//saveBmp(writePath, h_guassian_imgbuf, width, height, byteCount);saveBmp(writePath, h_Otus_imgbuf, width, height, 1);cout<<k<<"  "<<((float)k/frame)*100<<"%"<<endl;//释放内存cudaFree(d_src_imgbuf); cudaFree(d_gray_imgbuf);cudaFree(d_Otus_imgbuf);cudaFree(d_histogram);cudaFree(d_sum);cudaFree(d_thresh);delete []h_src_imgbuf;delete []h_gray_imgbuf;delete []h_Otus_imgbuf;}end = GetTickCount();InterlockedExchangeAdd(&time, end - start);cout << "Total time GPU:";cout << time << endl;int x;cin>>x;}static __global__ void kernel_color2gray(int width,int height,int byteCount,unsigned char *d_src_imgbuf,unsigned char *d_gray_imgbuf){const int tix = blockDim.x * blockIdx.x + threadIdx.x;const int tiy = blockDim.y * blockIdx.y + threadIdx.y;const int threadTotalX = blockDim.x * gridDim.x;const int threadTotalY = blockDim.y * gridDim.y;for(int ix = tix; ix < height; ix += threadTotalX)for(int iy = tiy; iy < width; iy += threadTotalY)*(d_gray_imgbuf+ix*width+iy)=0.11**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+0)+0.59**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+1)+0.30**(d_src_imgbuf+ix*width*byteCount+iy*byteCount+2)+0.5;}static __global__ void kernel_histogram(int width,int height,unsigned char* d_gray_imgbuf,int* histogram){__shared__ int temp[256];//直方图初始化为0temp[threadIdx.x*16+threadIdx.y] = 0;__syncthreads();const int tix = blockDim.x * blockIdx.x + threadIdx.x;const int tiy = blockDim.y * blockIdx.y + threadIdx.y;const int threadTotalX = blockDim.x * gridDim.x;const int threadTotalY = blockDim.y * gridDim.y;for(int ix = tix; ix < height; ix += threadTotalX){for(int iy = tiy; iy < width; iy += threadTotalY){atomicAdd(&(temp[d_gray_imgbuf[ix*width+iy]]), 1);}}__syncthreads();atomicAdd(&(histogram[threadIdx.x*16+threadIdx.y]), temp[threadIdx.x*16+threadIdx.y]);}static __global__ void kernel_ReductionSum(int *d_a, int* d_sum)  {      //申请共享内存，存在于每个block中       __shared__ int partialSum[256];        //确定索引       int tid = threadIdx.x;        //传global memory数据到shared memory partialSum[tid]=d_a[tid];      //传输同步      __syncthreads();            //在共享存储器中进行规约      for(int stride = 128; stride > 0; stride/=2)      {          if(tid<stride) partialSum[tid]+=partialSum[tid+stride];          __syncthreads();      }      //将当前block的计算结果写回输出数组      if(tid==0)            d_sum[0] = partialSum[0];  }  static __global__ void kernel_Variance(int *histogram, float* devi, int sum){int tid = threadIdx.x; //c0和c1组的均值float u0,u1;  //c0和c1组产生的概率float w0,w1; //c0组的像素数int count0; //阈值t//int t;  u0=0;count0=0;for(int i=0; i<=tid;i++){u0 += i*histogram[i];count0 += histogram[i];}u0=u0/count0;w0=(float)count0/sum;//计算阈值为t时，c1组的均值和产生的概率u1=0;for(int i=tid+1; i<256;i++)u1+=i*histogram[i];//C0组像素数与C1组像素数之和为图像总像素数。u1=u1/(sum-count0);w1=1-w0;//两组间的方差devi[tid]=w0*w1*(u1-u0)*(u1-u0);}static __global__ void kernel_ReductionMax(float* devi,int* thresh){//申请共享内存，存在于每个block中       __shared__ float Max[256*2];//__shared__ int index[256];      //确定索引       int tid = threadIdx.x;     // partialindex[tid]=0;    //传global memory数据到shared memory if(tid<256)Max[tid]=devi[tid];elseMax[tid]=tid-256;//初始化索引//index[256]=tid;    //传输同步      __syncthreads();            //在共享存储器中进行规约，记录索引变化    for(int stride = 128; stride > 0; stride/=2)      {          if(tid<stride) {if(Max[tid]<Max[tid+stride]){Max[tid]=Max[tid+stride];Max[tid+256]=Max[tid+stride+256];//index[tid]=index[tid+stride];}}        __syncthreads();      }      //将当前block的计算结果写回输出数组  if(tid==256)thresh[0]=Max[256]; }unsigned char *readBmp(char *bmpName, int *width, int *height, int *byteCount){//打开文件FILE *fp=fopen(bmpName,"rb");if(fp==0) return 0;//跳过文件头fseek(fp, sizeof(BITMAPFILEHEADER),0);//读入信息头int w, h, b;BITMAPINFOHEADER head;fread(&head, sizeof(BITMAPINFOHEADER), 1,fp); w = head.biWidth;h = head.biHeight;b = head.biBitCount/8;int lineByte=(w * b+3)/4*4; //每行的字节数为4的倍数//跳过颜色表 （颜色表的大小为1024）（彩色图像并没有颜色表，不需要这一步）if(b==1)fseek(fp, 1024,1);//图像数据unsigned char *imgBuf=new unsigned char[w * h * b];for(int i=0;i<h;i++){fread(imgBuf+i*w*b,w*b, 1,fp);fseek(fp, lineByte-w*b, 1);}fclose(fp);*width=w,  *height=h, *byteCount=b;return imgBuf;}bool saveBmp(char *bmpName, unsigned char *imgBuf, int width, int height, int byteCount){if(!imgBuf)return 0;//灰度图像颜色表空间1024，彩色图像没有颜色表int palettesize=0;if(byteCount==1) palettesize=1024;//一行象素字节数为4的倍数int lineByte=(width * byteCount+3)/4*4;FILE *fp=fopen(bmpName,"wb");if(fp==0) return 0;//填写文件头BITMAPFILEHEADER fileHead;fileHead.bfType = 0x4D42;fileHead.bfSize= sizeof(BITMAPFILEHEADER)+sizeof(BITMAPINFOHEADER)+ palettesize + lineByte*height;fileHead.bfReserved1 = 0;fileHead.bfReserved2 = 0;fileHead.bfOffBits=54+palettesize;fwrite(&fileHead, sizeof(BITMAPFILEHEADER),1, fp);// 填写信息头BITMAPINFOHEADER head; head.biBitCount=byteCount*8;head.biClrImportant=0;head.biClrUsed=0;head.biCompression=0;head.biHeight=height;head.biPlanes=1;head.biSize=40;head.biSizeImage=lineByte*height;head.biWidth=width;head.biXPelsPerMeter=0;head.biYPelsPerMeter=0;fwrite(&head, sizeof(BITMAPINFOHEADER),1, fp);//颜色表拷贝  if(palettesize==1024){unsigned char palette[1024];for(int i=0;i<256;i++){*(palette+i*4+0)=i;*(palette+i*4+1)=i;*(palette+i*4+2)=i;*(palette+i*4+3)=0;     }fwrite(palette, 1024,1, fp);}//准备数据并写文件unsigned char *buf=new unsigned char[height*lineByte];for(int i=0;i<height;i++){for(int j=0;j<width*byteCount; j++)*(buf+i*lineByte+j)=*(imgBuf+i*width*byteCount+j);}fwrite(buf, height*lineByte, 1, fp);delete []buf;fclose(fp);return 1;}
0 0