NEON 如何快速实现 4x4 整数倍的 resize area

来源:互联网 发布:java 集合类 深入详解 编辑:程序博客网 时间:2024/06/14 09:13
/*
 * 4x4 area (box-filter) downscale of an 8-bit, single-channel image.
 *
 * General idea: process 32 source columns of a row band at a time, add the
 * four source rows together, then take the average of every 4x4 block.
 * Each output pixel is (sum of its 16 source pixels) >> 4, i.e. the mean
 * truncated toward zero.
 */

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
/*
 * Reduce a 16-column x 4-row tile to four averaged values.
 *
 * Each row is loaded as 16 bytes and pairwise-widened to 8 x u16, the four
 * rows are added (at most 8 * 255 per lane, so u16 cannot overflow), a second
 * pairwise add yields the four 4x4 sums, and the narrowing shift divides by 16.
 */
static inline uint16x4_t avg4x4_tile16(const uchar *r0, const uchar *r1,
                                       const uchar *r2, const uchar *r3)
{
    uint16x8_t top = vaddq_u16(vpaddlq_u8(vld1q_u8(r0)), vpaddlq_u8(vld1q_u8(r1)));
    uint16x8_t bottom = vaddq_u16(vpaddlq_u8(vld1q_u8(r2)), vpaddlq_u8(vld1q_u8(r3)));

    return vshrn_n_u32(vpaddlq_u16(vaddq_u16(top, bottom)), 4);
}
#endif

/*
 * Downscale one band of four source rows into one destination row.
 *
 * src1..src4  the four consecutive source rows of the band
 * dest        destination row; receives src_width / 4 bytes
 * src_width   number of source columns to consume; a remainder of 1-3
 *             columns (src_width % 4) is ignored
 *
 * Full groups of 32 columns go through the NEON path when it is available;
 * whatever is left (or everything, without NEON) is handled by the scalar
 * loop, so no access ever goes past src_width columns or src_width / 4
 * output bytes.
 */
static void resizeline4_32(uchar *src1, uchar *src2, uchar *src3, uchar *src4, uchar *dest, int src_width)
{
    int w = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    /* Only enter while a whole 32-column group remains. */
    for (; src_width - w >= 32; w += 32) {
        uint16x4_t lo = avg4x4_tile16(src1 + w, src2 + w, src3 + w, src4 + w);
        uint16x4_t hi = avg4x4_tile16(src1 + w + 16, src2 + w + 16,
                                      src3 + w + 16, src4 + w + 16);

        /* The averages are <= 255, so narrowing to 8 bits is lossless. */
        vst1_u8(dest + w / 4, vmovn_u16(vcombine_u16(lo, hi)));
    }
#endif

    /* Scalar tail (and portable fallback): one 4x4 block per iteration. */
    for (; src_width - w >= 4; w += 4) {
        unsigned sum = 0;

        for (int c = 0; c < 4; c++) {
            sum += (unsigned)src1[w + c] + src2[w + c] + src3[w + c] + src4[w + c];
        }
        dest[w / 4] = (uchar)(sum >> 4);
    }
}

/*
 * Shrink a tightly packed 8-bit image by a factor of 4 in both directions.
 *
 * src         source pixels, src_width bytes per row, src_height rows
 * dest        destination pixels, dst_width bytes per row, dst_height rows
 * src_width   source row length in pixels (also the source stride)
 * src_height  number of source rows
 * dst_width   destination row length in pixels (also the destination stride)
 * dst_height  number of destination rows
 *
 * The caller is expected to pass dst_width == src_width / 4 and
 * dst_height == src_height / 4. If the destination is larger than that, only
 * the pixels that have a complete 4x4 source block are written; if it is
 * smaller, only the top-left dst_width x dst_height outputs are produced.
 * Null pointers and non-positive dimensions make the call a no-op.
 */
void neon_resize_44(uchar * src, uchar * dest,int src_width,int src_height,int dst_width,int dst_height)
{
    int rows;
    int cols;
    uchar *in = src;
    uchar *out = dest;

    if (!src || !dest || src_width <= 0 || src_height <= 0 ||
        dst_width <= 0 || dst_height <= 0) {
        return;
    }

    /* Never read source rows that do not exist. */
    rows = (src_height / 4 < dst_height) ? src_height / 4 : dst_height;
    /* Never write more than dst_width pixels into one destination row. */
    cols = (src_width / 4 < dst_width) ? src_width : dst_width * 4;

    for (int h = 0; h < rows; h++) {
        /* Step by pointer so no int row-offset product can overflow. */
        uchar *r0 = in;
        uchar *r1 = r0 + src_width;
        uchar *r2 = r1 + src_width;
        uchar *r3 = r2 + src_width;

        resizeline4_32(r0, r1, r2, r3, out, cols);

        in = r3 + src_width;
        out += dst_width;
    }
}