双线性插值算法ARM NEON优化
来源:互联网 发布:php重庆招聘贴吧 编辑:程序博客网 时间:2024/06/06 03:14
C语言版本双线性插值算法
/**
 * Scalar reference implementation of bilinear interpolation.
 *
 * The four samples are first blended along y (v11 with v12, v21 with v22),
 * then the two intermediate values are blended along x.
 *
 * @param x    weight of the v2x pair; the v1x pair gets (1 - x)
 * @param y    weight of the vx2 samples; the vx1 samples get (1 - y)
 * @param v11  sample weighted by (1 - x) * (1 - y)
 * @param v12  sample weighted by (1 - x) * y
 * @param v21  sample weighted by x * (1 - y)
 * @param v22  sample weighted by x * y
 * @return (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 * (1 - y) + v22 * y) * x
 *
 * Declared `static inline` on purpose: in C99/C11 a plain `inline` definition
 * does not provide an external definition, so a call that the compiler does
 * not inline (e.g. at -O0) would fail to link.
 */
static inline double bilinear_interp(double x, double y,
                                     double v11, double v12,
                                     double v21, double v22)
{
    const double along_y_1 = v11 * (1 - y) + v12 * y;
    const double along_y_2 = v21 * (1 - y) + v22 * y;

    return along_y_1 * (1 - x) + along_y_2 * x;
}
使用ARM NEON优化后的双线性插值版本
/*
 * Bilinearly blend four pixels in single-precision floating point.
 *
 * Each input holds four pixel values already widened to 16 bits. They are
 * widened again to 32 bits, converted to float, blended along y and then
 * along x, converted back to unsigned 32-bit integers and narrowed (with
 * saturation) to 16-bit lanes.
 */
static inline uint16x4_t bilinear_neon_blend4_f32(uint16x4_t v11, uint16x4_t v12,
                                                  uint16x4_t v21, uint16x4_t v22,
                                                  float32_t fx, float32_t fy)
{
    const float32_t one_fx = 1 - fx;
    const float32_t one_fy = 1 - fy;

    const float32x4_t p11 = vcvtq_f32_u32(vmovl_u16(v11));
    const float32x4_t p12 = vcvtq_f32_u32(vmovl_u16(v12));
    const float32x4_t p21 = vcvtq_f32_u32(vmovl_u16(v21));
    const float32x4_t p22 = vcvtq_f32_u32(vmovl_u16(v22));

    /* Blend along y: v11 with v12, and v21 with v22. */
    const float32x4_t along_y_1 = vaddq_f32(vmulq_n_f32(p11, one_fy),
                                            vmulq_n_f32(p12, fy));
    const float32x4_t along_y_2 = vaddq_f32(vmulq_n_f32(p21, one_fy),
                                            vmulq_n_f32(p22, fy));

    /* Blend the two intermediate results along x. */
    const float32x4_t blended = vaddq_f32(vmulq_n_f32(along_y_1, one_fx),
                                          vmulq_n_f32(along_y_2, fx));

    /* vcvtq_u32_f32 rounds toward zero, so the fractional part is dropped. */
    return vqmovn_u32(vcvtq_u32_f32(blended));
}

/**
 * NEON bilinear interpolation of eight 8-bit pixels at once (float math).
 *
 * Per lane this evaluates
 *   (v11 * (1 - y) + v12 * y) * (1 - x) + (v21 * (1 - y) + v22 * y) * x
 * in float32, using the same weights (x, y) for all eight lanes.
 *
 * @param x    weight of the v2x pair; narrowed from double to float32
 * @param y    weight of the vx2 samples; narrowed from double to float32
 * @param v11  eight samples weighted by (1 - x) * (1 - y)
 * @param v12  eight samples weighted by (1 - x) * y
 * @param v21  eight samples weighted by x * (1 - y)
 * @param v22  eight samples weighted by x * y
 * @return eight interpolated pixels; the float result is truncated (not
 *         rounded to nearest) and then narrowed with saturation to 8 bits.
 *
 * Declared `static inline`: a plain C99/C11 `inline` definition provides no
 * external definition and fails to link whenever a call is not inlined.
 */
static inline uint8x8_t bilinear_interp_NEON(double x, double y,
                                             uint8x8_t v11, uint8x8_t v12,
                                             uint8x8_t v21, uint8x8_t v22)
{
    const float32_t fx = (float32_t)x;
    const float32_t fy = (float32_t)y;

    /* Widen 8-bit pixels to 16 bits so each half can be processed as 4 lanes. */
    const uint16x8_t w11 = vmovl_u8(v11);
    const uint16x8_t w12 = vmovl_u8(v12);
    const uint16x8_t w21 = vmovl_u8(v21);
    const uint16x8_t w22 = vmovl_u8(v22);

    /* Lanes 0..3 */
    const uint16x4_t result_low = bilinear_neon_blend4_f32(
        vget_low_u16(w11), vget_low_u16(w12),
        vget_low_u16(w21), vget_low_u16(w22), fx, fy);

    /* Lanes 4..7 */
    const uint16x4_t result_high = bilinear_neon_blend4_f32(
        vget_high_u16(w11), vget_high_u16(w12),
        vget_high_u16(w21), vget_high_u16(w22), fx, fy);

    return vqmovn_u16(vcombine_u16(result_low, result_high));
}
使用ARM NEON后,一次可以处理8个像素,成倍提高了运行的速度。实践中需要特别注意对边界的处理(行的开始和结尾处)。
到这里我们还不能满足。要追求更快!!!
注意上面的代码虽然用了ARM NEON,但运算过程中仍然使用了浮点运算。所以,还可以继续采用浮点数定点化的方式进行优化,优化后的代码如下:
/*
 * Bilinearly blend four pixels using 12-bit fixed-point weights.
 *
 * wx and wy are the weights scaled by 4096; their complements are
 * 4096 - wx and 4096 - wy. Each input holds four pixel values already
 * widened to 16 bits. After the two blends the sum carries 24 fractional
 * bits (12 from each weight), which are removed by the final two shifts.
 */
static inline uint16x4_t bilinear_neon_blend4_q12(uint16x4_t v11, uint16x4_t v12,
                                                  uint16x4_t v21, uint16x4_t v22,
                                                  uint32_t wx, uint32_t wy)
{
    const uint32_t one_wx = 4096 - wx;
    const uint32_t one_wy = 4096 - wy;

    const uint32x4_t p11 = vmovl_u16(v11);
    const uint32x4_t p12 = vmovl_u16(v12);
    const uint32x4_t p21 = vmovl_u16(v21);
    const uint32x4_t p22 = vmovl_u16(v22);

    /* Blend along y: v11 with v12, and v21 with v22 (12 fractional bits). */
    const uint32x4_t along_y_1 = vaddq_u32(vmulq_n_u32(p11, one_wy),
                                           vmulq_n_u32(p12, wy));
    const uint32x4_t along_y_2 = vaddq_u32(vmulq_n_u32(p21, one_wy),
                                           vmulq_n_u32(p22, wy));

    /*
     * Blend along x (24 fractional bits). With 8-bit pixels and weights in
     * 0..4096 the sum is at most 255 * 4096 * 4096 = 255 * 2^24, which
     * still fits in 32 bits.
     */
    const uint32x4_t blended = vaddq_u32(vmulq_n_u32(along_y_1, one_wx),
                                         vmulq_n_u32(along_y_2, wx));

    /*
     * Drop the 24 fractional bits in two steps: a narrowing shift right by
     * 16 bits, then a rounding shift right by the remaining 8 bits.
     */
    return vrshr_n_u16(vshrn_n_u32(blended, 16), 8);
}

/**
 * NEON bilinear interpolation of eight 8-bit pixels at once (integer math).
 *
 * The weights x and y are converted to 12-bit fixed point (multiplied by
 * 4096 and truncated to an unsigned integer), so all vector arithmetic is
 * done on 32-bit unsigned integers. The same weights apply to all lanes.
 *
 * @param x    weight of the v2x pair; expected in [0, 1]
 * @param y    weight of the vx2 samples; expected in [0, 1]
 * @param v11  eight samples weighted by (1 - x) * (1 - y)
 * @param v12  eight samples weighted by (1 - x) * y
 * @param v21  eight samples weighted by x * (1 - y)
 * @param v22  eight samples weighted by x * y
 * @return eight interpolated pixels, narrowed with saturation to 8 bits.
 *
 * NOTE(review): weights outside [0, 1] are not handled; a negative value
 * cannot be converted to unsigned meaningfully and a value above 1 makes
 * the 4096 - w complement wrap around. Callers must guarantee the range.
 *
 * Declared `static inline`: a plain C99/C11 `inline` definition provides no
 * external definition and fails to link whenever a call is not inlined.
 */
static inline uint8x8_t bilinear_interp_NEON_FixedPoint(double x, double y,
                                                        uint8x8_t v11, uint8x8_t v12,
                                                        uint8x8_t v21, uint8x8_t v22)
{
    /* Weights in fixed point with 12 fractional bits (1.0 == 4096). */
    const unsigned int wx = (unsigned int)(x * 4096);
    const unsigned int wy = (unsigned int)(y * 4096);

    /* Widen 8-bit pixels to 16 bits so each half can be processed as 4 lanes. */
    const uint16x8_t w11 = vmovl_u8(v11);
    const uint16x8_t w12 = vmovl_u8(v12);
    const uint16x8_t w21 = vmovl_u8(v21);
    const uint16x8_t w22 = vmovl_u8(v22);

    /* Lanes 0..3 */
    const uint16x4_t result_low = bilinear_neon_blend4_q12(
        vget_low_u16(w11), vget_low_u16(w12),
        vget_low_u16(w21), vget_low_u16(w22), wx, wy);

    /* Lanes 4..7 */
    const uint16x4_t result_high = bilinear_neon_blend4_q12(
        vget_high_u16(w11), vget_high_u16(w12),
        vget_high_u16(w21), vget_high_u16(w22), wx, wy);

    return vqmovn_u16(vcombine_u16(result_low, result_high));
}
加入浮点数定点化优化之后,运行速度还能再提升一倍左右。
0 0
- 双线性插值算法ARM NEON优化
- 双线性插值算法ARM NEON优化
- ARM NEON 优化
- arm neon 优化原理
- ARM NEON 优化
- ARM NEON编译优化
- ARM vfp neon 浮点优化
- ARM NEON 编程系列8——ARM NEON 优化
- openCV3 双线性插值算法及优化
- -00-neon汇编优化实例讲解【ARM NEON加速】
- neon实现图像缩放算法(双线性插值法)
- neon优化二维卷积算法
- 转贴ARM NEON 优化的例子
- neon指令集(arm平台优化)
- 图像缩放算法及速度优化 ---- 双线性插值
- 图像处理界双线性插值算法的优化
- 图像处理界双线性插值算法的优化
- 图像处理界双线性插值算法的优化
- valgrind 的使用简介
- 顺序表经典面试题
- Java日期格式转化
- 现代通信网复习资料(第三章:分组交换原理)
- 13.C++ 构造函数、析构函数
- 双线性插值算法ARM NEON优化
- [ASP.NET]Dapper小型ORM的使用
- iterator相关
- Android Studio 主题、字体大小的设置
- Java设计模式(8)结构型:代理模式
- 【中级】2017项目集成管理工程师备考第二章-信息系统服务管理(中)
- Xcode8配置支持 ios 10.2
- Quagga安装过程记录
- 对机器学习与数据竞赛的一些总结