c,汇编速度的比较

来源:互联网 发布:淘宝店铺综合排名查询 编辑:程序博客网 时间:2024/05/28 23:09
#include <chrono>
#include <cstdint>
#include <cstdio>

#include <arm_neon.h>

#include "opencv2/core/core.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
// Disabled prototype of an external NEON BGR-to-gray routine; only referenced from commented-out code.
//extern "C" void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
// Prototypes of three externally defined routines that share the signature of sfC:
// pDst receives `count` unsigned 16-bit results, pSrc supplies `count` signed 16-bit inputs,
// and coeff / intercept are the multiplier and the added offset.
// NOTE(review): presumably implemented in separate assembly files; confirm against the build.
extern "C" void sfNEONbad1(unsigned short * pDst, short * pSrc, short coeff, short intercept, unsigned int count);// single-register variant
extern "C" void sfNEONbad2(unsigned short * pDst, short * pSrc, short coeff, short intercept, unsigned int count);// several-registers variant
extern "C" void sfNEONbad3(unsigned short * pDst, short * pSrc, short coeff, short intercept, unsigned int count);// preload variant
/// Minimal stopwatch built on std::chrono::steady_clock.
///
/// Call start() to record a reference instant; stop() then reports the
/// seconds elapsed since that instant without resetting it, so it may be
/// called repeatedly. The display helpers print a per-frame time or a
/// frame rate derived from stop().
class q_timer {
    using clock_type = std::chrono::steady_clock;

public:
    /// Records the current instant as the measurement origin.
    void start()
    {
        m_start = clock_type::now();
    }

    /// @return Seconds elapsed since the last start(), as a double.
    double stop()
    {
        const clock_type::time_point now = clock_type::now();
        // Converting to a floating-point duration is implicit and lossless enough for timing.
        const std::chrono::duration<double> elapsed = now - m_start;
        return elapsed.count();
    }

    /// Prints the elapsed time divided by nr_frame, labelled with disp.
    void time_display(const char *disp = "", int nr_frame = 1)
    {
        const double seconds_per_frame = stop() / nr_frame;
        printf("Running time (%s) is: %5.5f Seconds.\n", disp, seconds_per_frame);
    }

    /// Prints nr_frame divided by the elapsed time, labelled with disp.
    void fps_display(const char *disp = "", int nr_frame = 1)
    {
        const double frames_per_second = (double)nr_frame / stop();
        printf("Running time (%s) is: %5.5f frame per second.\n", disp, frames_per_second);
    }

private:
    clock_type::time_point m_start;  ///< Instant captured by start().
};


/**
 * Converts n interleaved 3-channel 8-bit pixels to 8-bit gray.
 *
 * Each output is (77 * c0 + 151 * c1 + 28 * c2) >> 8, where c0..c2 are the
 * three interleaved channel bytes of a pixel. The weights sum to 256, so the
 * 16-bit accumulator cannot overflow.
 *
 * Pixels are processed eight at a time with NEON when it is available. The
 * remaining n % 8 pixels (previously skipped) are handled by a scalar loop
 * that uses the same arithmetic; without NEON the scalar loop does all the work.
 *
 * @param dest  Output buffer with room for n bytes; must not alias src.
 * @param src   Input buffer holding 3 * n bytes.
 * @param n     Number of pixels; values <= 0 do nothing.
 */
void neon_convert(uint8_t * __restrict dest, uint8_t * __restrict src, int n)
{
    int done = 0;

#if defined(__ARM_NEON) || defined(__ARM_NEON__)
    const uint8x8_t rfac = vdup_n_u8(77);   // weight applied to channel 0 (R)
    const uint8x8_t gfac = vdup_n_u8(151);  // weight applied to channel 1 (G)
    const uint8x8_t bfac = vdup_n_u8(28);   // weight applied to channel 2 (B)

    // `n - done >= 8` avoids the overflow that `done + 8 <= n` could hit near INT_MAX.
    for (; n - done >= 8; done += 8)
    {
        // De-interleaving load: val[0], val[1], val[2] each hold one channel of 8 pixels.
        const uint8x8x3_t rgb = vld3_u8(src);

        // Widening multiply: 8-bit lanes produce 16-bit products.
        uint16x8_t acc = vmull_u8(rgb.val[0], rfac);
        // Widening multiply-accumulate for the other two channels.
        acc = vmlal_u8(acc, rgb.val[1], gfac);
        acc = vmlal_u8(acc, rgb.val[2], bfac);

        // Shift right by 8 and narrow back to 8-bit lanes, then store 8 results.
        vst1_u8(dest, vshrn_n_u16(acc, 8));

        src += 8 * 3;
        dest += 8;
    }
#endif

    // Scalar tail (and full fallback when NEON is unavailable).
    for (; done < n; ++done)
    {
        const unsigned int weighted = 77u * src[0] + 151u * src[1] + 28u * src[2];
        *dest++ = static_cast<uint8_t>(weighted >> 8);
        src += 3;
    }
}

/**
 * Scalar reference: dst[i] = saturate_u16(round((src[i] * coeff + intercept) / 256)).
 *
 * @param pDst       Output buffer with room for `count` unsigned 16-bit values.
 * @param pSrc       Input buffer holding `count` signed 16-bit values.
 * @param coeff      Multiplier applied to every input sample.
 * @param intercept  Offset added to each product before scaling.
 * @param count      Number of elements to process; 0 is a no-op. (The former
 *                   do/while decremented before testing, so a zero count wrapped
 *                   around and ran far past both buffers.)
 */
void sfC(unsigned short * pDst, short * pSrc, short coeff, short intercept, unsigned int count)
{
    for (; count != 0; --count) {
        // short * short + short always fits in int, so this cannot overflow.
        int res = *pSrc++ * coeff + intercept;

        // Round to nearest before dropping the low 8 bits: when bit 7 is set,
        // bump the value so the shift below yields the next higher quotient.
        if (res & 0x80) res += 256;
        res >>= 8;  // relies on arithmetic right shift for negative values

        // Saturate to the unsigned 16-bit output range.
        if (res < 0) res = 0;
        if (res > 0xffff) res = 0xffff;

        *pDst++ = (unsigned short) res;
    }
}

using namespace cv;
int main()
{
cv::Mat src = cv::imread("rgb.png", 0);
cv::Mat dst = cv::Mat(src.rows, src.cols, CV_8UC1);
q_timer time;time.start();
    short coeff = 1;
short intercept = 1;
unsigned int count = src.rows*src.cols / 2;
time.start();
for(int i=0;i<100;i++)
//fanicCvtBGR2GrayNEON(dst.data, src.data, src.rows*src.cols);
sfC((unsigned short * )dst.data, (short *) src.data, coeff, intercept, count);
time.time_display("c ");
time.start();
for(int i=0;i<100;i++)
sfNEONbad1((unsigned short * )dst.data, (short *) src.data, coeff, intercept, count);
time.time_display("single register");
time.start();
for(int i=0;i<100;i++)
sfNEONbad2((unsigned short * )dst.data, (short *) src.data, coeff, intercept, count);
time.time_display("serveral registers");
time.start();
for(int i=0;i<100;i++)
sfNEONbad3((unsigned short * )dst.data, (short *) src.data, coeff, intercept, count);
time.time_display("preload");
/*
//Mat dst = Mat(src.size(), CV_8U);
//q_timer time;
time.start();
for(int i=0;i<20;i++)
neon_convert(dst.data, src.data, src.cols*src.rows);
time.time_display("neon ");
    time.start();
Mat dst1;
for(int i=0;i<20;i++)
cvtColor(src, dst1, CV_BGR2GRAY);
time.time_display("no neon");
*/
cv::imwrite("gray.png", dst);
cv::imwrite("rgb1.png", src);
return 1;

}


a.s

@ sfNEONbad1(pDst, pSrc, coeff, intercept, count)
@ Per 16-bit element: multiply by coeff (widening to 32 bits), add intercept,
@ rounding-shift right by 8, saturate to unsigned 16 bits and store.
@ Handles 4 elements (one D register) per loop iteration.
.text
.global sfNEONbad1
 
@ Symbolic names for the argument registers.
pDst    .req    r0
pSrc    .req    r1
coeff   .req    r2
interc  .req    r3
count   .req    r12
 
.align 2
sfNEONbad1:
.cfi_startproc
 @ count is read from the top of the stack rather than from a register.
 ldr     count, [sp]
 @ Broadcast the low 16 bits of coeff / interc to all four lanes of d0 / d1.
 vdup.16 d0, coeff
 vdup.16 d1, interc
 
 1:
  @ Load four 16-bit inputs and post-increment pSrc.
  vld1.16   {d2}, [pSrc]!
  @ Signed widening multiply by lane 0 of d0: four 32-bit products in q1 (d2:d3).
  vmull.s16  q1, d2, d0[0]
  @ Add the sign-extended intercept lanes to the 32-bit products.
  vaddw.s16  q1, q1, d1
  @ Rounding shift right by 8, saturate to unsigned and narrow to 16 bits.
  vqrshrun.s32 d2, q1, #8
  @ Store four results and post-increment pDst.
  vst1.16   {d2}, [pDst]!
  @ Four elements consumed. The test comes after the work, so a count of 0 or one
  @ that is not a multiple of 4 still processes a full group of 4.
  subs   count, count, #4
 bgt     1b
bx      lr
.cfi_endproc


b.s

@ sfNEONbad2(pDst, pSrc, coeff, intercept, count)
@ Same per-element arithmetic as the single-register variant, but handles
@ 16 elements (four D registers) per loop iteration.
.text
.arm
.global sfNEONbad2
@ Symbolic names for the argument registers.
pDst    .req    r0
pSrc    .req    r1
coeff   .req    r2
interc  .req    r3
count   .req    r12
.align 2
sfNEONbad2:
.cfi_startproc
 @ count is read from the top of the stack rather than from a register.
 ldr     count, [sp]
 @ r2 / r3 are coeff / interc (see the .req aliases); broadcast to all lanes of d0 / d1.
 vdup.16 d0, r2 
 vdup.16 d1, r3 
 1:
  @ Load sixteen 16-bit inputs into d28-d31; the :128 qualifier asserts a 16-byte-aligned address.
  vld1.16  {d28-d31}, [pSrc,:128]!
  @ Four signed widening multiplies by lane 0 of d0. The destinations q14 (d28:d29)
  @ and q15 (d30:d31) overlap the inputs, so the order matters: each D input is
  @ read before or by the instruction that overwrites it.
  vmull.s16   q12, d28, d0[0]
  vmull.s16   q13, d29, d0[0]
  vmull.s16   q14, d30, d0[0]
  vmull.s16   q15, d31, d0[0]
  @ Add the sign-extended intercept lanes to each group of 32-bit products.
  vaddw.s16   q12, q12, d1
  vaddw.s16   q13, q13, d1
  vaddw.s16   q14, q14, d1
  vaddw.s16   q15, q15, d1
  @ Rounding shift right by 8, saturate to unsigned and narrow into d24-d27.
  @ d24-d27 are the halves of q12/q13, so this order is also load-bearing.
  vqrshrun.s32    d24, q12, #8
  vqrshrun.s32    d25, q13, #8
  vqrshrun.s32    d26, q14, #8
  vqrshrun.s32    d27, q15, #8
  @ Sixteen elements consumed; vst1 does not alter the flags that bgt tests.
  subs    count, count, #16
  @ Store sixteen results; the destination must also be 16-byte aligned.
  vst1.16  {d24-d27}, [pDst,:128]!
 @ The test comes after the work, so a count of 0 or one that is not a multiple
 @ of 16 still processes a full group of 16.
 bgt     1b
 bx      lr
.cfi_endproc


c.s

@ sfNEONbad3(pDst, pSrc, coeff, intercept, count)
@ Same loop as the several-registers variant, preceded by three pld hints.
.text
.arm
.global sfNEONbad3
 
@ Symbolic names for the argument registers.
pDst    .req    r0
pSrc    .req    r1
coeff   .req    r2
interc  .req    r3
count   .req    r12
 
.align 2
sfNEONbad3:
.cfi_startproc
 @ Preload hints for the first 192 bytes of the source. They are issued once,
 @ before the loop; nothing is preloaded inside the loop itself.
 pld  [pSrc]
 pld  [pSrc, #64*1]
 pld  [pSrc, #64*2]
 @ count is read from the top of the stack rather than from a register.
 ldr     count, [sp]
 @ r2 / r3 are coeff / interc (see the .req aliases); broadcast to all lanes of d0 / d1.
 vdup.16 d0, r2 
 vdup.16 d1, r3 
 
 1:
  @ Load sixteen 16-bit inputs into d28-d31; the :128 qualifier asserts a 16-byte-aligned address.
  vld1.16  {d28-d31}, [pSrc,:128]!
  @ Signed widening multiplies by lane 0 of d0. q14/q15 overlap the inputs
  @ d28-d31, so the instruction order must be kept.
  vmull.s16   q12, d28, d0[0]
  vmull.s16   q13, d29, d0[0]
  vmull.s16   q14, d30, d0[0]
  vmull.s16   q15, d31, d0[0]
 
  @ Add the sign-extended intercept lanes to each group of 32-bit products.
  vaddw.s16   q12, q12, d1
  vaddw.s16   q13, q13, d1
  vaddw.s16   q14, q14, d1
  vaddw.s16   q15, q15, d1
 
  @ Rounding shift right by 8, saturate to unsigned and narrow into d24-d27
  @ (the halves of q12/q13, so this order is load-bearing as well).
  vqrshrun.s32    d24, q12, #8
  vqrshrun.s32    d25, q13, #8
  vqrshrun.s32    d26, q14, #8
  vqrshrun.s32    d27, q15, #8
 
  @ Sixteen elements consumed; vst1 does not alter the flags that bgt tests.
  subs    count, count, #16
  @ Store sixteen results; the destination must also be 16-byte aligned.
  vst1.16  {d24-d27}, [pDst,:128]!
 @ The test comes after the work, so a count of 0 or one that is not a multiple
 @ of 16 still processes a full group of 16.
 bgt     1b
 
 bx      lr
.cfi_endproc

amt6757_wifi_n:/data # ./t
Running time (c ) is: 1.54038 Seconds.
Running time (single register) is: 1.58046 Seconds.
Running time (serveral registers) is: 1.24378 Seconds.
Running time (preload) is: 1.25389 Seconds.

原创粉丝点击