neon优化二维卷积算法
来源:互联网 发布:苹果狙击镜软件 编辑:程序博客网 时间:2024/06/07 00:07
卷积在图像处理中使用很频繁,由于数据量大,计算多,未经优化的卷积算法很慢。利用neon的并行计算,可以对其进行优化。
未经优化的C语言实现:
/*
 * convolve2DSlow: plain (unoptimized) 2D convolution of an 8-bit image.
 *
 * in / out      : dataSizeX * dataSizeY grayscale buffers (row-major)
 * kernel        : kernelSizeX * kernelSizeY float coefficients
 * Returns true on success, false on invalid arguments.
 *
 * The kernel is applied flipped (true convolution, not correlation).
 * Samples that fall outside the image are treated as zero.
 */
bool convolve2DSlow(unsigned char* in, unsigned char* out, int dataSizeX, int dataSizeY, float* kernel, int kernelSizeX, int kernelSizeY)
{
    int i, j, m, n, mm, nn;
    int kCenterX, kCenterY;         /* center index of kernel */
    float sum;                      /* temp accumulation buffer */
    int rowIndex, colIndex;

    /* check validity of params (both dimensions, not just X) */
    if (!in || !out || !kernel)
        return false;
    if (dataSizeX <= 0 || dataSizeY <= 0 || kernelSizeX <= 0 || kernelSizeY <= 0)
        return false;

    /* find center position of kernel (half of kernel size) */
    kCenterX = kernelSizeX / 2;
    kCenterY = kernelSizeY / 2;

    for (i = 0; i < dataSizeY; ++i)                 /* rows */
    {
        for (j = 0; j < dataSizeX; ++j)             /* columns */
        {
            sum = 0;                                /* init to 0 before sum */
            for (m = 0; m < kernelSizeY; ++m)       /* kernel rows */
            {
                mm = kernelSizeY - 1 - m;           /* row index of flipped kernel */
                for (n = 0; n < kernelSizeX; ++n)   /* kernel columns */
                {
                    nn = kernelSizeX - 1 - n;       /* column index of flipped kernel */

                    /* index of input signal, used for checking boundary */
                    rowIndex = i + m - kCenterY;
                    colIndex = j + n - kCenterX;

                    /* ignore input samples which are out of bound */
                    if (rowIndex >= 0 && rowIndex < dataSizeY &&
                        colIndex >= 0 && colIndex < dataSizeX)
                        sum += in[dataSizeX * rowIndex + colIndex] *
                               kernel[kernelSizeX * mm + nn];
                }
            }
            /* BUG FIX: converting a float outside [0, 255] to unsigned char
             * is undefined behavior (C11 6.3.1.4), so saturate before the cast */
            {
                float v = fabsf(sum) + 0.5f;
                out[dataSizeX * i + j] = (unsigned char)(v > 255.0f ? 255.0f : v);
            }
        }
    }
    return true;
}
neon优化
思路:neon可以一次处理4个32位数据,考虑到卷积核为float类型,灰度像素数据为uint8_t,neon可以一次读取8个像素数据,将像素数据转化为float后进行计算。因为neon只能按内存顺序读取固定数量(8个)数据,为了避免剩余数据和卷积计算中的边界问题,可以将像素矩阵用0填充扩展,每行的头尾分别加上kernelSizeX/2个0数据,并确保每行数据量为8的倍数,不足则补0凑齐(以增加内存开销来降低代码复杂度和时间复杂度,因为优化目标主要在于提升时间效率)
neon优化后的实现:
/*
 * convolve2D_neon: NEON-optimized 2D convolution of an 8-bit image.
 *
 * src / dst     : datasizeX * datasizeY grayscale buffers (row-major)
 * kernel        : kernelSizeX * kernelSizeY float coefficients (applied flipped)
 * Returns 0 on success, -1 on invalid arguments or allocation failure.
 *
 * Strategy: copy the image into a zero-padded buffer (kernel half-width
 * border, rows rounded up to a multiple of 8) so every 8-pixel NEON load
 * is in bounds, then accumulate 8 output pixels at a time as two f32x4
 * sums and narrow back to u8 with saturation.
 */
int convolve2D_neon(unsigned char * src, unsigned char * dst, int datasizeX, int datasizeY, float* kernel, int kernelSizeX, int kernelSizeY)
{
    if (!dst || !src || !kernel)
        return -1;
    if (datasizeX <= 0 || kernelSizeX <= 0)
        return -1;

    int x, y, kx, ky, k, mm, nn, yIndex, xIndex;
    float32x4_t k_neon;
    uint8x8_t   neon_x_u8;
    uint16x8_t  neon_x_u16_8;
    uint16x4_t  neon_x_u16_4;
    uint16x4_t  neon_x_u16_4_2;
    uint32x4_t  neon_x_u32;
    float32x4_t neon_x_f32;
    float32x4_t neon_sum_f32_high;
    float32x4_t neon_sum_f32_low;
    int kCenterX, kCenterY;

    kCenterX = kernelSizeX >> 1;
    kCenterY = kernelSizeY >> 1;

    /* pad each row to a multiple of 8 so the vector loop needs no remainder */
    k = datasizeX % 8;
    if (k)
        k = 8 - k;

    int strde = datasizeX + 2 * kCenterX + k;   /* stride of padded buffer */
    size_t inSize = (size_t)strde * (size_t)(datasizeY + 2 * kCenterY);
    unsigned char *in = (unsigned char *)malloc(inSize);
    if (!in)
        return -1;
    /* BUG FIX: the original called memset(in, 0, sizeof(in)), which zeroes
     * only sizeof(unsigned char*) bytes; zero the entire padded buffer so
     * the border and row padding really are 0 */
    memset(in, 0, inSize);

    for (y = 0; y < datasizeY; y++)
        for (x = 0; x < datasizeX; x++)
            in[(y + kCenterY) * strde + x + kCenterX] = src[y * datasizeX + x];

    for (y = 0; y < datasizeY; y++)
    {
        for (x = 0; x < datasizeX; x = x + 8)
        {
            neon_sum_f32_high = vdupq_n_f32(0);
            neon_sum_f32_low  = vdupq_n_f32(0);
            for (ky = 0; ky < kernelSizeY; ky++)        /* kernel rows */
            {
                mm = kernelSizeY - 1 - ky;              /* row index of flipped kernel */
                for (kx = 0; kx < kernelSizeX; ++kx)    /* kernel columns */
                {
                    nn = kernelSizeX - 1 - kx;          /* column index of flipped kernel */
                    yIndex = y + ky;
                    xIndex = x + kx;
                    neon_x_u8 = vld1_u8(in + yIndex * strde + xIndex);  /* load 8 pixels */
                    neon_x_u16_8 = vmovl_u8(neon_x_u8);                 /* widen u8 -> u16 */
                    /* broadcast kernel[kernelSizeX * mm + nn] to all 4 lanes */
                    k_neon = vld1q_dup_f32(kernel + kernelSizeX * mm + nn);

                    /* low 4 pixels: u16 -> u32 -> f32, multiply-accumulate */
                    neon_x_u16_4 = vget_low_u16(neon_x_u16_8);
                    neon_x_u32 = vmovl_u16(neon_x_u16_4);
                    neon_x_f32 = vcvtq_f32_u32(neon_x_u32);
                    neon_sum_f32_low = vmlaq_f32(neon_sum_f32_low, neon_x_f32, k_neon);

                    /* high 4 pixels */
                    neon_x_u16_4 = vget_high_u16(neon_x_u16_8);
                    neon_x_u32 = vmovl_u16(neon_x_u16_4);
                    neon_x_f32 = vcvtq_f32_u32(neon_x_u32);
                    neon_sum_f32_high = vmlaq_f32(neon_sum_f32_high, neon_x_f32, k_neon);
                }
            }

            /* narrow the two f32x4 accumulators back to u8x8 with saturation */
            neon_x_u32 = vcvtq_u32_f32(neon_sum_f32_low);
            neon_x_u16_4 = vqmovn_u32(neon_x_u32);
            neon_x_u32 = vcvtq_u32_f32(neon_sum_f32_high);
            neon_x_u16_4_2 = vqmovn_u32(neon_x_u32);
            neon_x_u16_8 = vcombine_u16(neon_x_u16_4, neon_x_u16_4_2);
            neon_x_u8 = vqmovn_u16(neon_x_u16_8);

            /* BUG FIX: the original always stored 8 bytes, overrunning the
             * output row (and the buffer on the last row) when
             * datasizeX % 8 != 0; spill the tail through a scratch array */
            if (x + 8 <= datasizeX) {
                vst1_u8(dst + y * datasizeX + x, neon_x_u8);
            } else {
                unsigned char tail[8];
                vst1_u8(tail, neon_x_u8);
                memcpy(dst + y * datasizeX + x, tail, (size_t)(datasizeX - x));
            }
        }
    }
    free(in);
    in = NULL;
    return 0;
}
可分离的卷积计算:
如果二维的卷积核矩阵可以分解为两个一维矩阵相乘,那么可以对二维数据先进行x方向(行)上的一维卷积计算,然后再进行y方向(列)的一维卷积计算
C/C++语言代码实现:
/*
 * convolve2DSeparable: 2D convolution via two 1D passes for a separable
 * kernel (kernel == outer product of kernelX and kernelY).
 *
 * in / out            : dataSizeX * dataSizeY grayscale buffers (row-major)
 * kernelX / kSizeX    : horizontal 1D kernel
 * kernelY / kSizeY    : vertical 1D kernel
 * Returns true on success, false on invalid arguments or allocation failure.
 *
 * Pass 1 convolves every row into a float buffer `tmp`; pass 2 convolves
 * `tmp` column-wise, accumulating a whole row of column sums in `sum`.
 * Border pixels use the partial kernel (out-of-bound samples skipped),
 * matching convolve2DSlow's zero-boundary behavior.
 *
 * NOTE: allocation switched from new[]/delete[] to malloc/free for
 * consistency with the rest of this file (and so the NULL checks are
 * meaningful); also fixes the leak of `tmp` when `sum` failed to allocate.
 */
bool convolve2DSeparable(unsigned char* in, unsigned char* out, int dataSizeX, int dataSizeY, float* kernelX, int kSizeX, float* kernelY, int kSizeY)
{
    int i, j, k, m, n;
    float *tmp, *sum;                   /* intermediate data buffers */
    unsigned char *inPtr, *outPtr;      /* working pointers */
    float *tmpPtr, *tmpPtr2;            /* working pointers */
    int kCenter, kOffset, endIndex;     /* kernel indices */

    /* check validity of params */
    if (!in || !out || !kernelX || !kernelY)
        return false;
    if (dataSizeX <= 0 || kSizeX <= 0)
        return false;

    /* allocate temp storage to keep intermediate result */
    tmp = (float*)malloc(sizeof(float) * (size_t)dataSizeX * (size_t)dataSizeY);
    if (!tmp)
        return false;                   /* memory allocation error */

    /* store accumulated column sums for one output row */
    sum = (float*)malloc(sizeof(float) * (size_t)dataSizeX);
    if (!sum) {
        free(tmp);                      /* BUG FIX: tmp leaked here before */
        return false;
    }

    /* convolve horizontal direction ///////////////////////
     * find center position of kernel (half of kernel size) */
    kCenter = kSizeX >> 1;
    endIndex = dataSizeX - kCenter;     /* index for full kernel convolution */

    inPtr = in;
    tmpPtr = tmp;                       /* intermediate results of 1D horizontal convolution */

    for (i = 0; i < dataSizeY; ++i)     /* number of rows */
    {
        kOffset = 0;                    /* starting index of partial kernel varies per sample */

        /* COLUMN FROM index=0 TO index=kCenter-1 (left border, partial kernel) */
        for (j = 0; j < kCenter; ++j)
        {
            *tmpPtr = 0;
            for (k = kCenter + kOffset, m = 0; k >= 0; --k, ++m)
                *tmpPtr += *(inPtr + m) * kernelX[k];
            ++tmpPtr;
            ++kOffset;
        }

        /* COLUMN FROM index=kCenter TO index=(dataSizeX-kCenter-1): full kernel */
        for (j = kCenter; j < endIndex; ++j)
        {
            *tmpPtr = 0;
            for (k = kSizeX - 1, m = 0; k >= 0; --k, ++m)
                *tmpPtr += *(inPtr + m) * kernelX[k];
            ++inPtr;
            ++tmpPtr;
        }

        kOffset = 1;                    /* ending index of partial kernel varies per sample */

        /* COLUMN FROM index=(dataSizeX-kCenter) TO index=(dataSizeX-1): right border */
        for (j = endIndex; j < dataSizeX; ++j)
        {
            *tmpPtr = 0;
            for (k = kSizeX - 1, m = 0; k >= kOffset; --k, ++m)
                *tmpPtr += *(inPtr + m) * kernelX[k];
            ++inPtr;
            ++tmpPtr;
            ++kOffset;
        }

        inPtr += kCenter;               /* advance to start of next row */
    }
    /* END OF HORIZONTAL CONVOLUTION */

    /* start vertical direction /////////////////////////// */
    kCenter = kSizeY >> 1;
    endIndex = dataSizeY - kCenter;     /* where full-kernel convolution stops */

    tmpPtr = tmpPtr2 = tmp;
    outPtr = out;

    /* clear sum array before accumulation */
    for (i = 0; i < dataSizeX; ++i)
        sum[i] = 0;

    /* ROW FROM index=0 TO index=(kCenter-1): top border, partial kernel */
    kOffset = 0;
    for (i = 0; i < kCenter; ++i)
    {
        for (k = kCenter + kOffset; k >= 0; --k)
        {
            for (j = 0; j < dataSizeX; ++j)
            {
                sum[j] += *tmpPtr * kernelY[k];
                ++tmpPtr;
            }
        }
        for (n = 0; n < dataSizeX; ++n)     /* convert and copy from sum to out */
        {
            /* BUG FIX: clamp before the cast — converting a float > 255 to
             * unsigned char is undefined behavior (C11 6.3.1.4) */
            float v = fabsf(sum[n]) + 0.5f;
            *outPtr = (unsigned char)(v > 255.0f ? 255.0f : v);
            sum[n] = 0;                     /* reset for next summing */
            ++outPtr;
        }
        tmpPtr = tmpPtr2;                   /* reset input pointer to row 0 */
        ++kOffset;
    }

    /* ROW FROM index=kCenter TO index=(dataSizeY-kCenter-1): full kernel */
    for (i = kCenter; i < endIndex; ++i)
    {
        for (k = kSizeY - 1; k >= 0; --k)
        {
            for (j = 0; j < dataSizeX; ++j)
            {
                sum[j] += *tmpPtr * kernelY[k];
                ++tmpPtr;
            }
        }
        for (n = 0; n < dataSizeX; ++n)
        {
            float v = fabsf(sum[n]) + 0.5f;
            *outPtr = (unsigned char)(v > 255.0f ? 255.0f : v);
            sum[n] = 0;
            ++outPtr;
        }
        /* move to next row */
        tmpPtr2 += dataSizeX;
        tmpPtr = tmpPtr2;
    }

    /* ROW FROM index=(dataSizeY-kCenter) TO index=(dataSizeY-1): bottom border */
    kOffset = 1;
    for (i = endIndex; i < dataSizeY; ++i)
    {
        for (k = kSizeY - 1; k >= kOffset; --k)
        {
            for (j = 0; j < dataSizeX; ++j)
            {
                sum[j] += *tmpPtr * kernelY[k];
                ++tmpPtr;
            }
        }
        for (n = 0; n < dataSizeX; ++n)
        {
            float v = fabsf(sum[n]) + 0.5f;
            *outPtr = (unsigned char)(v > 255.0f ? 255.0f : v);
            sum[n] = 0;
            ++outPtr;
        }
        /* move to next row */
        tmpPtr2 += dataSizeX;
        tmpPtr = tmpPtr2;
        ++kOffset;
    }
    /* END OF VERTICAL CONVOLUTION */

    free(tmp);
    free(sum);
    return true;
}
可分离二维卷积的neon优化思路:先实现一维的卷积优化,思路相同,一样是填0扩展,避免剩余数据和边界问题,然后并行计算4个像素的卷积计算,由于neon只能一次读取顺序的8个uint8_t数据,只能进行x方向的卷积,所以需要进行矩阵转置(矩阵转置也可以用neon优化,网上说可以提速10倍)
neon优化实现:
/*
 * convolve1D: NEON-accelerated 1D convolution of an 8-bit signal.
 *
 * src / dst   : dataSize-element buffers
 * kernel      : kernelSize float coefficients (applied flipped)
 * Returns 0 on success, -1 on allocation failure.
 *
 * The input is copied into a zero-padded buffer (kernel half-width in
 * front, rounded up to a multiple of 8 plus slack behind) so every
 * 8-byte NEON load stays inside the allocation.
 */
int convolve1D(unsigned char* src, unsigned char* dst, int dataSize, float* kernel, int kernelSize)
{
    int kCenter = kernelSize >> 1;
    int k = dataSize % 8;
    int x, kx;
    if (k)
        k = 8 - k;

    size_t inSize = (size_t)dataSize + 2 * (size_t)kCenter + (size_t)k + 8;
    unsigned char* in = (unsigned char*)malloc(inSize);
    if (!in)
        return -1;
    /* BUG FIX: the original zeroed only dataSize+2*kCenter+k bytes, leaving
     * the last 8 bytes of the allocation uninitialized even though vld1_u8
     * can read into them; zero the whole buffer */
    memset(in, 0, inSize);
    memcpy(in + kCenter, src, (size_t)dataSize);

    float32x4_t k_neon;
    uint8x8_t   neon_x_u8;
    uint16x8_t  neon_x_u16_8;
    uint16x4_t  neon_x_u16_4;
    uint16x4_t  neon_x_u16_4_2;
    uint32x4_t  neon_x_u32;
    float32x4_t neon_x_f32;
    float32x4_t neon_sum_f32_high;
    float32x4_t neon_sum_f32_low;

    for (x = 0; x < dataSize; x = x + 8)
    {
        neon_sum_f32_high = vdupq_n_f32(0);
        neon_sum_f32_low  = vdupq_n_f32(0);
        for (kx = 0; kx < kernelSize; kx++)
        {
            neon_x_u8 = vld1_u8(in + x + kx);       /* load 8 pixels */
            neon_x_u16_8 = vmovl_u8(neon_x_u8);     /* widen u8 -> u16 */
            /* broadcast the flipped kernel coefficient to all 4 lanes */
            k_neon = vld1q_dup_f32(&kernel[kernelSize - kx - 1]);

            /* low 4 pixels: u16 -> u32 -> f32, multiply-accumulate */
            neon_x_u16_4 = vget_low_u16(neon_x_u16_8);
            neon_x_u32 = vmovl_u16(neon_x_u16_4);
            neon_x_f32 = vcvtq_f32_u32(neon_x_u32);
            neon_sum_f32_low = vmlaq_f32(neon_sum_f32_low, neon_x_f32, k_neon);

            /* high 4 pixels */
            neon_x_u16_4 = vget_high_u16(neon_x_u16_8);
            neon_x_u32 = vmovl_u16(neon_x_u16_4);
            neon_x_f32 = vcvtq_f32_u32(neon_x_u32);
            neon_sum_f32_high = vmlaq_f32(neon_sum_f32_high, neon_x_f32, k_neon);
        }

        /* narrow the f32x4 pair back to saturated u8x8 */
        neon_x_u32 = vcvtq_u32_f32(neon_sum_f32_low);
        neon_x_u16_4 = vqmovn_u32(neon_x_u32);
        neon_x_u32 = vcvtq_u32_f32(neon_sum_f32_high);
        neon_x_u16_4_2 = vqmovn_u32(neon_x_u32);
        neon_x_u16_8 = vcombine_u16(neon_x_u16_4, neon_x_u16_4_2);
        neon_x_u8 = vqmovn_u16(neon_x_u16_8);

        /* BUG FIX: the original always stored 8 bytes, writing past the end
         * of dst when dataSize % 8 != 0; spill the tail via a scratch array */
        if (x + 8 <= dataSize) {
            vst1_u8(dst + x, neon_x_u8);
        } else {
            unsigned char tail[8];
            vst1_u8(tail, neon_x_u8);
            memcpy(dst + x, tail, (size_t)(dataSize - x));
        }
    }
    free(in);
    in = NULL;
    return 0;
}

/*
 * convolve2DSeparable: NEON-optimized separable 2D convolution.
 * Convolves each row with kernelX, transposes, convolves each (former)
 * column with kernelY, then transposes back into dst.
 *
 * src / dst : dataSizeX * dataSizeY grayscale buffers (row-major)
 * Returns 0 on success, -1 on allocation failure.
 *
 * The transposes exist because NEON can only load contiguous bytes, so
 * both passes must run along rows. (Removed the unused timeval locals.)
 */
int convolve2DSeparable(unsigned char* src, unsigned char* dst, int dataSizeX, int dataSizeY, float* kernelX, int kSizeX, float* kernelY, int kSizeY)
{
    int x, y;
    unsigned char* in_tmp = (unsigned char*)malloc((size_t)dataSizeX * (size_t)dataSizeY);
    if (!in_tmp)
        return -1;

    /* 1D convolution along each row */
    for (y = 0; y < dataSizeY; y++)
        convolve1D(src + y * dataSizeX, in_tmp + y * dataSizeX, dataSizeX, kernelX, kSizeX);

    /* transpose so columns become contiguous rows */
    for (y = 0; y < dataSizeY; y++)
        for (x = 0; x < dataSizeX; x++)
            dst[x * dataSizeY + y] = in_tmp[y * dataSizeX + x];

    /* 1D convolution along each former column */
    for (x = 0; x < dataSizeX; x++)
        convolve1D(dst + x * dataSizeY, in_tmp + x * dataSizeY, dataSizeY, kernelY, kSizeY);

    /* transpose back into the output layout */
    for (y = 0; y < dataSizeY; y++)
        for (x = 0; x < dataSizeX; x++)
            dst[y * dataSizeX + x] = in_tmp[x * dataSizeY + y];

    free(in_tmp);
    return 0;
}
测试:
测试图为1920*1080的灰度图,测试平台为海思3559板子,卷积核大小为5*5,所有值均为0.25(平均模糊卷积),分别进行普通编译和O3级优化编译neon优化代码和原来的c语言代码,进行运行时间比较
编译命令:
arm-hisiv600-linux-g++ -mfloat-abi=softfp -mfpu=neon -o neon_test1080 neon_test.c
arm-hisiv600-linux-g++ -mfloat-abi=softfp -mfpu=neon -O3 neon_test.c
测试结果
测试结果与平台以及编译器相关,经过不同平台的测试,O3优化的加速比在不同平台和编译器下不同,但基本能提速2-10倍,而neon相比于C的提速比,经测试,似乎与处理的图片大小有关,没有进一步测试。
- neon优化二维卷积算法
- 双线性插值算法ARM NEON优化
- 双线性插值算法ARM NEON优化
- 二维三次卷积插值算法
- 从一个复数点积算法看NEON的汇编优化(NEON优化实例)
- 二维卷积
- 二维卷积/矩阵卷积
- 卷积神经网络中的优化算法比较
- 卷积神经网络中的优化算法比较
- ARM NEON 优化
- arm neon 优化原理
- neon自动向量优化
- ARM NEON 优化
- ARM NEON编译优化
- CNN 卷积神经网络 二维卷积
- Gabor二维卷积运算
- 二维卷积的基本原理
- 二维卷积c代码
- 初学java:一球从100米高度自由落下,每次落地后反跳回原高度的一半,再落下,求它在第10次落地时,共经过多少米?第10次反弹多高?
- W3school的Html5新标签
- 获取NavigationView中的menu,改变menu子item的属性
- mysql查询上月记录数
- Codeforces Round #443 (Div. 2) C. Short Program
- neon优化二维卷积算法
- JAVA运算符优先级概念
- 多任务学习“Exploiting Multi-Grain Ranking Constraints for Precisely Searching Visually-similar Vehicles”
- scrapy爬虫注意点(2)—— 根据page循环遍历页面参数问题
- unity5 GI全局光照 问题集锦解答
- 曹可爱之最可爱-Vue.js入门(三)模版语法
- 如何给bigdecimal保留小数位
- C#中关闭第一个Form窗口,打开另一个新窗口方法
- Java学习笔记(2)