CUDA总结:opencv cuda模块高斯滤波函数分析
来源:互联网 发布:海信网络 经理 张四海 编辑:程序博客网 时间:2024/06/03 07:58
基于opencv 3.1
相关接口
cv::Ptr<cv::cuda::Filter> // 滤波器对象指针,位于opencv2\cudafilters.hppcv::cuda::createGaussianFilter // 创建高斯滤波器对象,位于opencv2\cudafilters.hppcv::cuda::Filter::apply // 滤波器实现,位于opencv2\cudafilters.hppclass SeparableLinearFilter // 独立线性滤波器对象,位于sources\modules\cudafilters\src\filtering.cppfilter::linearRow //线性行滤波函数,位于sources\modules\cudafilters\src\cuda\row_filter.hppfilter::linearColumn //线性行滤波函数,位于sources\modules\cudafilters\src\cuda\row_filter.hpp
其中,SeparableLinearFilter 通过一个函数指针数组区分不同的输入数据类型:
//filtering.cpp typedef void (*func_t)(PtrStepSzb src, PtrStepSzb dst, const float* kernel, int ksize, int anchor, int brd_type, int cc, cudaStream_t stream);static const func_t rowFilterFuncs[7][4] = //支持1、3、4通道图像 { {filter::linearRow<uchar, float>, 0, filter::linearRow<uchar3, float3>, filter::linearRow<uchar4, float4>}, //8位无符号整型数据 {0, 0, 0, 0}, //!8位有符号不支持 {filter::linearRow<ushort, float>, 0, filter::linearRow<ushort3, float3>, filter::linearRow<ushort4, float4>}, //16位无符号整型数据 {filter::linearRow<short, float>, 0, filter::linearRow<short3, float3>, filter::linearRow<short4, float4>}, //16位有符号整型数据 {filter::linearRow<int, float>, 0, filter::linearRow<int3, float3>, filter::linearRow<int4, float4>}, //32位有符号整型数据 {filter::linearRow<float, float>, 0, filter::linearRow<float3, float3>, filter::linearRow<float4, float4>}, //单精度浮点数据 {0, 0, 0, 0} //!双精度浮点不支持 };
然后,filter::linearRow又通过一个函数指针数组往下一层调用:
static const caller_t callers[5][33] = { { 0, row_filter::caller< 1, T, D, BrdRowConstant>, row_filter::caller< 2, T, D, BrdRowConstant>, row_filter::caller< 3, T, D, BrdRowConstant>, row_filter::caller< 4, T, D, BrdRowConstant>, row_filter::caller< 5, T, D, BrdRowConstant>, row_filter::caller< 6, T, D, BrdRowConstant>, row_filter::caller< 7, T, D, BrdRowConstant>, row_filter::caller< 8, T, D, BrdRowConstant>, row_filter::caller< 9, T, D, BrdRowConstant>, row_filter::caller<10, T, D, BrdRowConstant>, row_filter::caller<11, T, D, BrdRowConstant>, row_filter::caller<12, T, D, BrdRowConstant>, row_filter::caller<13, T, D, BrdRowConstant>, row_filter::caller<14, T, D, BrdRowConstant>, row_filter::caller<15, T, D, BrdRowConstant>, row_filter::caller<16, T, D, BrdRowConstant>, row_filter::caller<17, T, D, BrdRowConstant>, row_filter::caller<18, T, D, BrdRowConstant>, row_filter::caller<19, T, D, BrdRowConstant>, row_filter::caller<20, T, D, BrdRowConstant>, row_filter::caller<21, T, D, BrdRowConstant>, row_filter::caller<22, T, D, BrdRowConstant>, row_filter::caller<23, T, D, BrdRowConstant>, row_filter::caller<24, T, D, BrdRowConstant>, row_filter::caller<25, T, D, BrdRowConstant>, row_filter::caller<26, T, D, BrdRowConstant>, row_filter::caller<27, T, D, BrdRowConstant>, row_filter::caller<28, T, D, BrdRowConstant>, row_filter::caller<29, T, D, BrdRowConstant>, row_filter::caller<30, T, D, BrdRowConstant>, row_filter::caller<31, T, D, BrdRowConstant>, row_filter::caller<32, T, D, BrdRowConstant> }, ... };
好了,到这里就进入到cuda代码了,raow_filter::caller的实现:
//row_filter.hpp template <int KSIZE, typename T, typename D, template<typename> class B> //KSIZE代表,T代表输入数据类型,D代表输出数据类型,B代表边界处理(border) void caller(PtrStepSz<T> src, PtrStepSz<D> dst, int anchor, int cc, cudaStream_t stream) { int BLOCK_DIM_X; int BLOCK_DIM_Y; int PATCH_PER_BLOCK; //!设置线程块中的线程数目,确保硬件利用率达到100% if (cc >= 20) { //!计算能力>=2.0,线程数=256 BLOCK_DIM_X = 32; BLOCK_DIM_Y = 8; PATCH_PER_BLOCK = 4; //最大支持的通道数=4 } else { //!计算能力<2.0,线程数=128 BLOCK_DIM_X = 32; BLOCK_DIM_Y = 4; PATCH_PER_BLOCK = 4; } const dim3 block(BLOCK_DIM_X, BLOCK_DIM_Y); const dim3 grid(divUp(src.cols, BLOCK_DIM_X * PATCH_PER_BLOCK), divUp(src.rows, BLOCK_DIM_Y)); B<T> brd(src.cols); // linearRowFilter<KSIZE, T, D><<<grid, block, 0, stream>>>(src, dst, anchor, brd); cudaSafeCall( cudaGetLastError() ); if (stream == 0) cudaSafeCall( cudaDeviceSynchronize() ); }
其中,gridDim.x需要按照blockDim.x*4对齐,gridDim.y需要按照blockDim.y对齐,通过divUp函数:
//sources\modules\core\include\opencv2\core\cuda\common.hpp __host__ __device__ __forceinline__ int divUp(int total, int grain) { return (total + grain - 1) / grain; }
边界处理类,在后面的kernel函数中用到:
//sources\modules\core\include\opencv2\core\cuda\border_interpolate.hpp template <typename D> struct BrdRowConstant { typedef D result_type; explicit __host__ __device__ __forceinline__ BrdRowConstant(int width_, const D& val_ = VecTraits<D>::all(0)) : width(width_), val(val_) {} template <typename T> __device__ __forceinline__ D at_low(int x, const T* data) const { return x >= 0 ? saturate_cast<D>(data[x]) : val; } template <typename T> __device__ __forceinline__ D at_high(int x, const T* data) const { return x < width ? saturate_cast<D>(data[x]) : val; } template <typename T> __device__ __forceinline__ D at(int x, const T* data) const { return (x >= 0 && x < width) ? saturate_cast<D>(data[x]) : val; } int width; D val; };
高斯滤波的kernel函数:
//row_filter.hpp #define MAX_KERNEL_SIZE 32 //最大支持的滤波核大小 __constant__ float c_kernel[MAX_KERNEL_SIZE]; //滤波核(模板),由外部输入 template <int KSIZE, typename T, typename D, typename B> __global__ void linearRowFilter(const PtrStepSz<T> src, PtrStep<D> dst, const int anchor, const B brd) { #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 200) const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 8; const int PATCH_PER_BLOCK = 4; const int HALO_SIZE = 1; #else const int BLOCK_DIM_X = 32; const int BLOCK_DIM_Y = 4; const int PATCH_PER_BLOCK = 4; const int HALO_SIZE = 1; #endif typedef typename TypeVec<float, VecTraits<T>::cn>::vec_type sum_t; __shared__ sum_t smem[BLOCK_DIM_Y][(PATCH_PER_BLOCK + 2 * HALO_SIZE) * BLOCK_DIM_X]; //计算卷积时,每个像素都要跟滤波核的所有元素相乘一次,多次使用的变量用共享内存缓存 const int y = blockIdx.y * BLOCK_DIM_Y + threadIdx.y; //计算二维矩阵中的坐标y,即第y行 if (y >= src.rows) return; const T* src_row = src.ptr(y); //第y行的首指针 const int xStart = blockIdx.x * (PATCH_PER_BLOCK * BLOCK_DIM_X) + threadIdx.x; //计算二维矩阵中的坐标x,即第x列 if (blockIdx.x > 0) { //Load left halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart - (HALO_SIZE - j) * BLOCK_DIM_X]); } else { //Load left halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y][threadIdx.x + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_low(xStart - (HALO_SIZE - j) * BLOCK_DIM_X, src_row)); } if (blockIdx.x + 2 < gridDim.x) { //Load main data #pragma unroll for (int j = 0; j < PATCH_PER_BLOCK; ++j) smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + j * BLOCK_DIM_X]); //Load right halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(src_row[xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X]); } else { //Load main data #pragma unroll for (int j = 0; j < PATCH_PER_BLOCK; ++j) smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + j * BLOCK_DIM_X, src_row)); //Load right halo #pragma unroll for (int j = 0; j < HALO_SIZE; ++j) smem[threadIdx.y][threadIdx.x + (PATCH_PER_BLOCK + HALO_SIZE) * BLOCK_DIM_X + j * BLOCK_DIM_X] = saturate_cast<sum_t>(brd.at_high(xStart + (PATCH_PER_BLOCK + j) * BLOCK_DIM_X, src_row)); } __syncthreads(); #pragma unroll for (int j = 0; j < PATCH_PER_BLOCK; ++j) { const int x = xStart + j * BLOCK_DIM_X; if (x < src.cols) { sum_t sum = VecTraits<sum_t>::all(0); #pragma unroll for (int k = 0; k < KSIZE; ++k) sum = sum + smem[threadIdx.y][threadIdx.x + HALO_SIZE * BLOCK_DIM_X + j * BLOCK_DIM_X - anchor + k] * c_kernel[k]; //像素与滤波核每个元素逐一相乘,并累加权值 dst(y, x) = saturate_cast<D>(sum); } } }
0 0
- CUDA总结:opencv cuda模块高斯滤波函数分析
- CUDA总结:opencv图像金字塔函数分析
- CUDA实现图像的高斯滤波(opencv实现)
- OpenCV中GPU模块(CUDA)函数
- OpenCV中GPU模块(CUDA)函数
- 转:Opencv高斯滤波函数分析
- CUDA二维纹理内存+OpenCV图像滤波
- cuda 常用函数总结
- opencv----滤波函数:方框滤波、均值滤波、高斯滤波、中值滤波、双边滤波
- cuda & opencv
- opencv cuda
- cuda opencv
- OpenCV高斯滤波效率分析
- OpenCV中的GPU模块、CUDA加速
- openCV 中的高斯滤波GaussianBlur函数
- Opencv高斯滤波-GaussianBlur函数
- CUDA卷积操作—使用constant memory实现高斯滤波
- CUDA卷积操作—使用constant memory实现高斯滤波
- 数据库传入中文乱码
- Android Fragment 懒加载
- 二分查找套路
- 【气球】关于气球动力与控制的设想
- 专题3-ARM汇编指令详解
- CUDA总结:opencv cuda模块高斯滤波函数分析
- 斐波拉契数列
- Webpack-模块化系统简述
- ionic2混合app开发系列1-环境搭建和ionic2项目目录介绍
- 回首2016,展望2017
- 单例模式
- Leetcode118. Pascal's Triangle (杨辉三角)
- scala 学习(九)——元组和咬合操作
- [LeetCode]Top K Frequent Elements(Java)