C 与汇编速度的比较
来源:互联网 发布:淘宝店铺综合排名查询 编辑:程序博客网 时间:2024/05/28 23:09
#include "opencv2/highgui/highgui.hpp"
#include <chrono>
#include <arm_neon.h>
#include "opencv2/imgproc/imgproc.hpp"
//extern "C" void fanicCvtBGR2GrayNEON(void *pDst, void *pSrc, unsigned int size);
// Hand-written NEON assembly routines with the same parameter list as sfC.
// They are declared extern "C" so the unmangled names match the assembly labels.
extern "C" void sfNEONbad1(unsigned short * pDst, short * pSrc, short coeff, short intercept, unsigned int count);// variant described as "single register"
extern "C" void sfNEONbad2(unsigned short * pDst, short * pSrc, short coeff, short intercept, unsigned int count);// variant described as "several registers"
extern "C" void sfNEONbad3(unsigned short * pDst, short * pSrc, short coeff, short intercept, unsigned int count);// variant described as "preload"
/// Minimal stopwatch built on std::chrono::steady_clock.
///
/// start() records a reference point. stop() and the *_display() helpers report
/// the time elapsed since the most recent start() and do not reset it, so they
/// are const and can be called repeatedly.
class q_timer {
public:
    /// Records the current steady_clock time as the reference point.
    void start()
    {
        m_start = std::chrono::steady_clock::now();
    }

    /// Returns the number of seconds elapsed since the last start().
    double stop() const
    {
        const std::chrono::steady_clock::time_point end = std::chrono::steady_clock::now();
        return std::chrono::duration_cast<std::chrono::duration<double> >(end - m_start).count();
    }

    /// Prints the elapsed seconds divided by nr_frame.
    /// @param disp     label inserted into the message
    /// @param nr_frame divisor applied to the elapsed time (1 prints the total)
    void time_display(const char *disp = "", int nr_frame = 1) const
    {
        printf("Running time (%s) is: %5.5f Seconds.\n", disp, stop() / nr_frame);
    }

    /// Prints nr_frame divided by the elapsed seconds.
    /// @param disp     label inserted into the message
    /// @param nr_frame number of frames processed since start()
    void fps_display(const char *disp = "", int nr_frame = 1) const
    {
        // The value printed is a rate, so the message must not call it a running time.
        printf("Frame rate (%s) is: %5.5f frames per second.\n", disp, static_cast<double>(nr_frame) / stop());
    }

private:
    std::chrono::steady_clock::time_point m_start;
};
/// Converts n interleaved 3-channel 8-bit pixels into n 8-bit gray values.
///
/// Each output byte is (77 * c0 + 151 * c1 + 28 * c2) >> 8, where c0..c2 are the
/// three interleaved channels of one pixel. The weights sum to 256, so the
/// result always fits in 8 bits.
///
/// NOTE(review): the function header was missing from this listing. The name and
/// argument order are taken from the call
/// `neon_convert(dst.data, src.data, src.cols*src.rows)`; confirm the exact
/// parameter types against the original source.
///
/// @param dest output buffer with room for n bytes
/// @param src  input buffer holding 3 * n bytes
/// @param n    number of pixels to convert
void neon_convert(uint8_t *dest, const uint8_t *src, int n)
{
    // Weights for channels 0, 1 and 2. The original comments label them R, G, B;
    // the actual channel order depends on the layout of the caller's data.
    const uint8x8_t rfac = vdup_n_u8(77);
    const uint8x8_t gfac = vdup_n_u8(151);
    const uint8x8_t bfac = vdup_n_u8(28);

    const int blocks = n / 8;
    for (int i = 0; i < blocks; ++i)
    {
        // De-interleaving load: val[0], val[1], val[2] each hold one channel of 8 pixels.
        const uint8x8x3_t rgb = vld3_u8(src);

        // Widening multiply: 8-bit x 8-bit -> 16-bit lanes.
        uint16x8_t temp = vmull_u8(rgb.val[0], rfac);
        // Widening multiply-accumulate for the other two channels.
        temp = vmlal_u8(temp, rgb.val[1], gfac);
        temp = vmlal_u8(temp, rgb.val[2], bfac);

        // Shift every lane right by 8 and narrow back to 8 bits.
        const uint8x8_t result = vshrn_n_u16(temp, 8);
        vst1_u8(dest, result);

        src += 8 * 3;
        dest += 8;
    }

    // Scalar tail: the vector loop only covers whole groups of 8 pixels, so the
    // remaining n % 8 pixels are converted here with the same arithmetic.
    for (int i = blocks * 8; i < n; ++i)
    {
        *dest++ = static_cast<uint8_t>((src[0] * 77 + src[1] * 151 + src[2] * 28) >> 8);
        src += 3;
    }
}
/// Scalar reference implementation of the scale/offset kernel.
///
/// For each of the count input samples it computes src * coeff + intercept in
/// int precision, divides by 256 with rounding to nearest, clamps the result to
/// [0, 0xffff] and stores it as an unsigned 16-bit value.
///
/// @param pDst      output buffer with room for count elements
/// @param pSrc      input buffer holding count elements
/// @param coeff     multiplier applied to every sample
/// @param intercept offset added after the multiplication
/// @param count     number of samples to process; 0 is a no-op
void sfC(unsigned short * pDst, short * pSrc, short coeff, short intercept, unsigned int count)
{
    // Pre-tested loop: a post-tested `do { } while (--count)` would run once for
    // count == 0, wrap the unsigned counter, and walk far past both buffers.
    for (; count != 0; --count) {
        int res = *pSrc++ * coeff + intercept;

        // Round to nearest: bit 7 set means the discarded fraction is >= 0.5,
        // so bump the value by one unit of the post-shift result.
        if (res & 0x80) res += 256;
        res >>= 8;

        // Saturate to the unsigned 16-bit range.
        if (res < 0) res = 0;
        else if (res > 0xffff) res = 0xffff;

        *pDst++ = static_cast<unsigned short>(res);
    }
}
using namespace cv;
/// Benchmark driver: runs the scalar kernel and the three NEON assembly kernels
/// 100 times each over the pixels of "rgb.png" and prints the total time of each
/// batch, then writes the last result and the input back out as PNG files.
///
/// @return 0 on success, 1 if the input image cannot be used.
int main()
{
    // Flag 0 is passed to imread, and the output below is created as CV_8UC1.
    cv::Mat src = cv::imread("rgb.png", 0);
    if (src.empty()) {
        // Without this check an unreadable file yields a zero-sized buffer that
        // the kernels would still try to process.
        printf("Failed to load rgb.png\n");
        return 1;
    }

    // Zero-filled so that the few tail bytes skipped by the rounding below are
    // well defined in the written image.
    cv::Mat dst = cv::Mat::zeros(src.rows, src.cols, CV_8UC1);

    q_timer time;
    short coeff = 1;
    short intercept = 1;

    // The 8-bit buffers are reinterpreted as 16-bit samples, hence the division
    // by 2. The assembly loops consume up to 16 samples per iteration and test
    // the counter only after the body, so the count is rounded down to a
    // multiple of 16 to keep them inside the buffers.
    unsigned int count = static_cast<unsigned int>(src.rows * src.cols / 2) & ~15u;
    if (count == 0) {
        printf("Image rgb.png is too small to benchmark\n");
        return 1;
    }

    unsigned short *out = reinterpret_cast<unsigned short *>(dst.data);
    short *in = reinterpret_cast<short *>(src.data);

    time.start();
    for (int i = 0; i < 100; i++) {
        //fanicCvtBGR2GrayNEON(dst.data, src.data, src.rows*src.cols);
        sfC(out, in, coeff, intercept, count);
    }
    time.time_display("c ");

    time.start();
    for (int i = 0; i < 100; i++) {
        sfNEONbad1(out, in, coeff, intercept, count);
    }
    time.time_display("single register");

    time.start();
    for (int i = 0; i < 100; i++) {
        sfNEONbad2(out, in, coeff, intercept, count);
    }
    time.time_display("serveral registers");

    time.start();
    for (int i = 0; i < 100; i++) {
        sfNEONbad3(out, in, coeff, intercept, count);
    }
    time.time_display("preload");

    /*
    //Mat dst = Mat(src.size(), CV_8U);
    //q_timer time;
    time.start();
    for(int i=0;i<20;i++)
    neon_convert(dst.data, src.data, src.cols*src.rows);
    time.time_display("neon ");
    time.start();
    Mat dst1;
    for(int i=0;i<20;i++)
    cvtColor(src, dst1, CV_BGR2GRAY);
    time.time_display("no neon");
    */

    cv::imwrite("gray.png", dst);
    cv::imwrite("rgb1.png", src);

    // A non-zero exit status signals failure to the caller; success returns 0.
    return 0;
}
a.s
@ sfNEONbad1(pDst, pSrc, coeff, intercept, count)
@
@ For each 16-bit input sample the loop computes sample * coeff + intercept in
@ 32 bits, shifts right by 8 with rounding, saturates to unsigned 16 bits and
@ stores the result. Four samples are handled per loop iteration.
@
@ The loop counter is tested after the body, so the body always runs at least
@ once and a count that is not a multiple of 4 is effectively rounded up.
@ NOTE(review): unlike the sibling routines in this listing, no .arm directive
@ appears here - confirm the intended instruction set.
.text
.global sfNEONbad1
@ Symbolic names for the argument registers.
pDst .req r0
pSrc .req r1
coeff .req r2
interc .req r3
count .req r12
.align 2
sfNEONbad1:
.cfi_startproc
@ The fifth argument is read from the top of the stack (presumably per the ARM
@ procedure call standard, which passes only four arguments in r0-r3 - confirm).
ldr count, [sp]
@ Broadcast the 16-bit coefficient and intercept into all lanes of d0 and d1.
vdup.16 d0, coeff
vdup.16 d1, interc
1:
@ Load four 16-bit samples and advance pSrc past them.
vld1.16 {d2}, [pSrc]!
@ Signed widening multiply by the coefficient: four 32-bit products in q1.
@ q1 is the d2:d3 pair, so the loaded samples are overwritten by the result.
vmull.s16 q1, d2, d0[0]
@ Add the intercept, sign-extended to 32 bits, to each product.
vaddw.s16 q1, q1, d1
@ Rounding shift right by 8, then saturate and narrow to unsigned 16-bit lanes.
vqrshrun.s32 d2, q1, #8
@ Store four results and advance pDst past them.
vst1.16 {d2}, [pDst]!
@ Four samples consumed; repeat while the remaining count is greater than zero.
subs count, count, #4
bgt 1b
bx lr
.cfi_endproc
b.s
@ sfNEONbad2(pDst, pSrc, coeff, intercept, count)
@
@ Same arithmetic as a per-sample (sample * coeff + intercept) >> 8 with
@ rounding and unsigned 16-bit saturation, but 16 samples are processed per
@ loop iteration using four D registers of input at a time.
@
@ The loop counter is tested after the body, so the body always runs at least
@ once and a count that is not a multiple of 16 is effectively rounded up.
@ Both pointers carry a :128 alignment qualifier, so they are expected to be
@ 16-byte aligned.
.text
.arm
.global sfNEONbad2
@ Symbolic names for the argument registers.
pDst .req r0
pSrc .req r1
coeff .req r2
interc .req r3
count .req r12
.align 2
sfNEONbad2:
.cfi_startproc
@ The fifth argument is read from the top of the stack (presumably per the ARM
@ procedure call standard, which passes only four arguments in r0-r3 - confirm).
ldr count, [sp]
@ Broadcast the 16-bit coefficient (r2) and intercept (r3) into d0 and d1.
vdup.16 d0, r2
vdup.16 d1, r3
1:
@ Load sixteen 16-bit samples into d28-d31 and advance pSrc past them.
vld1.16 {d28-d31}, [pSrc,:128]!
@ Signed widening multiplies by the coefficient. The order matters: q14 is
@ d28:d29 and q15 is d30:d31, so each input register is read before the
@ product that overwrites it is written.
vmull.s16 q12, d28, d0[0]
vmull.s16 q13, d29, d0[0]
vmull.s16 q14, d30, d0[0]
vmull.s16 q15, d31, d0[0]
@ Add the intercept, sign-extended to 32 bits, to every product.
vaddw.s16 q12, q12, d1
vaddw.s16 q13, q13, d1
vaddw.s16 q14, q14, d1
vaddw.s16 q15, q15, d1
@ Rounding shift right by 8, saturate and narrow to unsigned 16-bit lanes.
@ Results are packed into d24-d27 (q12:q13); again each source register is
@ consumed before the destination that aliases it is written.
vqrshrun.s32 d24, q12, #8
vqrshrun.s32 d25, q13, #8
vqrshrun.s32 d26, q14, #8
vqrshrun.s32 d27, q15, #8
@ Sixteen samples consumed. The flags set here are still valid at the branch
@ because the store in between does not modify them.
subs count, count, #16
@ Store sixteen results and advance pDst past them.
vst1.16 {d24-d27}, [pDst,:128]!
bgt 1b
bx lr
.cfi_endproc
c.s
@ sfNEONbad3(pDst, pSrc, coeff, intercept, count)
@
@ Same 16-samples-per-iteration kernel as a per-sample
@ (sample * coeff + intercept) >> 8 with rounding and unsigned 16-bit
@ saturation, preceded by three preload hints on the source buffer.
@
@ The loop counter is tested after the body, so the body always runs at least
@ once and a count that is not a multiple of 16 is effectively rounded up.
@ Both pointers carry a :128 alignment qualifier, so they are expected to be
@ 16-byte aligned.
.text
.arm
.global sfNEONbad3
@ Symbolic names for the argument registers.
pDst .req r0
pSrc .req r1
coeff .req r2
interc .req r3
count .req r12
.align 2
sfNEONbad3:
.cfi_startproc
@ Preload hints for the source at offsets 0, 64 and 128 bytes. They are issued
@ once, before the loop; no further hints are given inside the loop.
pld [pSrc]
pld [pSrc, #64*1]
pld [pSrc, #64*2]
@ The fifth argument is read from the top of the stack (presumably per the ARM
@ procedure call standard, which passes only four arguments in r0-r3 - confirm).
ldr count, [sp]
@ Broadcast the 16-bit coefficient (r2) and intercept (r3) into d0 and d1.
vdup.16 d0, r2
vdup.16 d1, r3
1:
@ Load sixteen 16-bit samples into d28-d31 and advance pSrc past them.
vld1.16 {d28-d31}, [pSrc,:128]!
@ Signed widening multiplies by the coefficient. The order matters: q14 is
@ d28:d29 and q15 is d30:d31, so each input register is read before the
@ product that overwrites it is written.
vmull.s16 q12, d28, d0[0]
vmull.s16 q13, d29, d0[0]
vmull.s16 q14, d30, d0[0]
vmull.s16 q15, d31, d0[0]
@ Add the intercept, sign-extended to 32 bits, to every product.
vaddw.s16 q12, q12, d1
vaddw.s16 q13, q13, d1
vaddw.s16 q14, q14, d1
vaddw.s16 q15, q15, d1
@ Rounding shift right by 8, saturate and narrow to unsigned 16-bit lanes.
@ Results are packed into d24-d27 (q12:q13); again each source register is
@ consumed before the destination that aliases it is written.
vqrshrun.s32 d24, q12, #8
vqrshrun.s32 d25, q13, #8
vqrshrun.s32 d26, q14, #8
vqrshrun.s32 d27, q15, #8
@ Sixteen samples consumed. The flags set here are still valid at the branch
@ because the store in between does not modify them.
subs count, count, #16
@ Store sixteen results and advance pDst past them.
vst1.16 {d24-d27}, [pDst,:128]!
bgt 1b
bx lr
.cfi_endproc
amt6757_wifi_n:/data # ./t
Running time (c ) is: 1.54038 Seconds.
Running time (single register) is: 1.58046 Seconds.
Running time (serveral registers) is: 1.24378 Seconds.
Running time (preload) is: 1.25389 Seconds.
- c,汇编速度的比较
- C语言的字符串比较与汇编的对照
- 采用汇编和C两种语言,分别求取CPU当前的速度
- C/C++/Perl/汇编/Java效率比较
- C/C++/Perl/汇编/Java效率比较
- C/C++/Perl/汇编/Java效率比较
- 数组拷贝的速度比较
- 发送消息的速度比较
- FORCAL与C/C++、MATLAB、Python、Lua等各种语言的速度比较
- std::vector与std::list的执行速度比较 (C/C++) (STL)
- std::vector与std::list的执行速度比较 (C/C++) (STL)
- std::vector与std::list的执行速度比较 (C/C++) (STL)
- std::vector与std::list的执行速度比较 (C/C++) (STL)
- std::vector与std::list的执行速度比较 (C/C++) (STL)
- std::vector与std::list的执行速度比较 (C/C++) (STL)
- Lu与C/C++、Forcal、MATLAB、Python、Lua等各种语言的速度比较
- Keil中C语言汇编代码比较
- 快排的三种实现,以及三种实现运算速度的比较 c语言
- SSL2844 2017年11月8日提高组T2 奇怪的队列(线段树贪心)
- 程序出错
- [备忘录]-Nginx配置文件详解
- POJ
- Relative atomic mass
- c,汇编速度的比较
- 大学数学视频教程整理
- python 将数据写入excel
- 5.2
- linux软件包
- 电脑中安装多个Tomcat,如何配置(已经解决)
- 表达式计算
- switch 根据键盘录入成绩 显示分数及判断等级(第三次)
- 跳马(象棋)