memchr SSE 加速

来源:互联网 发布:tcp端口有哪些 编辑:程序博客网 时间:2024/06/05 12:44

memchr_sse.s

/*----------------------------------------------------------------------
 * void *memchr_sse(const void *s, int c, size_t n)
 *
 * SSE2 byte search, GNU as / AT&T syntax, x86-64 System V ABI (ELF).
 *
 * In:    rdi = s, esi = c (only the low byte is used), rdx = n
 * Out:   rax = address of the first byte equal to (unsigned char)c
 *              within s[0 .. n-1], or 0 (NULL) if there is none
 * Clobb: rcx, rdx, rdi, xmm0-xmm4, xmm1 (pattern), flags
 * Stack: none (leaf function)
 *
 * Strategy: broadcast c into xmm1, compare 16 bytes at a time with
 * pcmpeqb and turn the result into a bit mask with pmovmskb.  Except
 * for the very first probe, all loads are 16-byte aligned (movdqa), so
 * whole blocks are read even if they extend before s or past s + n;
 * matches outside [s, s + n) are filtered out arithmetically.
 *
 * rdx is reused as "bytes remaining" and is frequently kept biased
 * (e.g. remaining - 64) so that a single sub/jbe both updates the
 * counter and tests for the end of the buffer.
 *----------------------------------------------------------------------*/
        .text
        .globl  memchr_sse
        .p2align 4, 0x90                /* 16-byte align the entry point */
memchr_sse:
        movd    %esi, %xmm1             /* xmm1[7:0] = c */
        mov     %rdi, %rcx
        punpcklbw %xmm1, %xmm1          /* c c .. */
        test    %rdx, %rdx
        jz      L_return_null           /* n == 0 -> NULL */
        punpcklbw %xmm1, %xmm1          /* c c c c .. */
        and     $63, %rcx               /* rcx = s % 64 */
        pshufd  $0, %xmm1, %xmm1        /* xmm1 = c replicated in all 16 bytes */
        cmp     $48, %rcx
        ja      L_crosscache            /* a 16-byte load at s would cross a 64-byte line */

        /* First 16 bytes, unaligned load directly from s. */
        movdqu  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches_1             /* candidate found; still check it is < n */
        sub     $16, %rdx
        jbe     L_return_null           /* n <= 16 and nothing found */
        add     $16, %rdi
        and     $15, %rcx               /* rcx = s % 16 */
        and     $-16, %rdi              /* rdi = (s + 16) rounded down to 16 */
        add     %rcx, %rdx              /* account for the rcx bytes that get re-scanned;
                                           cannot overflow: rdx was just reduced by 16 */
        sub     $64, %rdx               /* rdx = remaining - 64 */
        jbe     L_exit_loop
        jmp     L_loop_prolog

        .p2align 4
L_crosscache:
        and     $15, %rcx               /* rcx = s % 16 */
        and     $-16, %rdi              /* rdi = s rounded down to 16 */
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        /* Check if there is a match.  */
        pmovmskb %xmm0, %eax
        /* Remove the leading bytes (those located before s).  */
        sar     %cl, %eax
        test    %eax, %eax
        je      L_unaligned_no_match
        /* Check which byte is a match.  */
        bsf     %eax, %eax              /* rax = offset of the match relative to s */
        sub     %rax, %rdx
        jbe     L_return_null           /* match lies at or beyond s + n */
        add     %rdi, %rax
        add     %rcx, %rax              /* rax = aligned base + misalignment + offset */
        ret

        .p2align 4
L_unaligned_no_match:
        /* The first aligned block held (16 - rcx) bytes of the buffer.
           Compute remaining = n - (16 - rcx) in that order: the obvious
           (n + rcx) - 16 wraps around when n is close to SIZE_MAX and
           would make the function return NULL although a match exists.  */
        neg     %rcx
        add     $16, %rcx               /* rcx = 16 - (s % 16) */
        sub     %rcx, %rdx
        jbe     L_return_null
        add     $16, %rdi
        sub     $64, %rdx               /* rdx = remaining - 64 */
        jbe     L_exit_loop

        /* Here more than 64 bytes remain starting at the 16-aligned rdi. */
        .p2align 4
L_loop_prolog:
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches

        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     L_matches16

        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     L_matches32

        movdqa  48(%rdi), %xmm4
        pcmpeqb %xmm1, %xmm4
        add     $64, %rdi
        pmovmskb %xmm4, %eax
        test    %eax, %eax
        jnz     L_matches0              /* rdi already advanced: match is at rdi - 16 + idx */

        test    $0x3f, %rdi
        jz      L_align64_loop          /* already 64-byte aligned */

        /* Not yet 64-byte aligned: scan one more 64-byte group. */
        sub     $64, %rdx
        jbe     L_exit_loop

        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches

        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     L_matches16

        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     L_matches32

        movdqa  48(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        add     $64, %rdi
        test    %eax, %eax
        jnz     L_matches0

        /* Round rdi down to a 64-byte boundary and give the bytes that
           will be scanned a second time back to the counter. */
        mov     %rdi, %rcx
        and     $-64, %rdi
        and     $63, %rcx
        add     %rcx, %rdx

        /* Main loop: 64 bytes per iteration from a 64-byte aligned rdi. */
        .p2align 4
L_align64_loop:
        sub     $64, %rdx
        jbe     L_exit_loop             /* 64 bytes or fewer left: bounded tail code */
        movdqa  (%rdi), %xmm0
        movdqa  16(%rdi), %xmm2
        movdqa  32(%rdi), %xmm3
        movdqa  48(%rdi), %xmm4

        pcmpeqb %xmm1, %xmm0
        pcmpeqb %xmm1, %xmm2
        pcmpeqb %xmm1, %xmm3
        pcmpeqb %xmm1, %xmm4

        /* Fold the four compare results into xmm4: one mask test per
           64 bytes.  xmm0 and xmm2 keep their individual results. */
        pmaxub  %xmm0, %xmm3
        pmaxub  %xmm2, %xmm4
        pmaxub  %xmm3, %xmm4
        pmovmskb %xmm4, %eax

        add     $64, %rdi

        test    %eax, %eax
        jz      L_align64_loop

        /* A match is somewhere in the 64 bytes just scanned. */
        sub     $64, %rdi

        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches

        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     L_matches16

        /* xmm3/xmm4 were overwritten by pmaxub: redo blocks 3 and 4. */
        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3

        pcmpeqb 48(%rdi), %xmm1         /* destroys the pattern; we return right after */
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     L_matches32

        pmovmskb %xmm1, %eax            /* must be non-zero: the match is in block 4 */
        bsf     %eax, %eax
        lea     48(%rdi, %rax), %rax
        ret

        /* Tail: rdx = remaining - 64 with remaining <= 64. */
        .p2align 4
L_exit_loop:
        add     $32, %rdx               /* rdx = remaining - 32 */
        jle     L_exit_loop_32          /* 32 bytes or fewer left */

        /* More than 32 bytes left: blocks 0 and 1 are entirely valid. */
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches

        movdqa  16(%rdi), %xmm2
        pcmpeqb %xmm1, %xmm2
        pmovmskb %xmm2, %eax
        test    %eax, %eax
        jnz     L_matches16

        movdqa  32(%rdi), %xmm3
        pcmpeqb %xmm1, %xmm3
        pmovmskb %xmm3, %eax
        test    %eax, %eax
        jnz     L_matches32_1           /* bounded: idx must be < remaining - 32 */
        sub     $16, %rdx               /* rdx = remaining - 48 */
        jle     L_return_null

        pcmpeqb 48(%rdi), %xmm1
        pmovmskb %xmm1, %eax
        test    %eax, %eax
        jnz     L_matches48_1           /* bounded: idx must be < remaining - 48 */
        xor     %eax, %eax
        ret

        .p2align 4
L_exit_loop_32:
        add     $32, %rdx               /* rdx = remaining (1..32) */
        movdqa  (%rdi), %xmm0
        pcmpeqb %xmm1, %xmm0
        pmovmskb %xmm0, %eax
        test    %eax, %eax
        jnz     L_matches_1             /* bounded: idx must be < remaining */
        sub     $16, %rdx               /* rdx = remaining - 16 */
        jbe     L_return_null

        pcmpeqb 16(%rdi), %xmm1
        pmovmskb %xmm1, %eax
        test    %eax, %eax
        jnz     L_matches16_1           /* bounded: idx must be < remaining - 16 */
        xor     %eax, %eax
        ret

        /* Unbounded match exits: the whole block is known to be inside
           the buffer, so the first set mask bit is the answer. */
        .p2align 4
L_matches0:
        bsf     %eax, %eax
        lea     -16(%rax, %rdi), %rax   /* rdi was already advanced by 64 */
        ret

        .p2align 4
L_matches:
        bsf     %eax, %eax
        add     %rdi, %rax
        ret

        .p2align 4
L_matches16:
        bsf     %eax, %eax
        lea     16(%rax, %rdi), %rax
        ret

        .p2align 4
L_matches32:
        bsf     %eax, %eax
        lea     32(%rax, %rdi), %rax
        ret

        /* Bounded match exits: rdx holds the number of valid bytes in
           the block; a match at index >= rdx is outside the buffer. */
        .p2align 4
L_matches_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     L_return_null
        add     %rdi, %rax
        ret

        .p2align 4
L_matches16_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     L_return_null
        lea     16(%rdi, %rax), %rax
        ret

        .p2align 4
L_matches32_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     L_return_null
        lea     32(%rdi, %rax), %rax
        ret

        .p2align 4
L_matches48_1:
        bsf     %eax, %eax
        sub     %rax, %rdx
        jbe     L_return_null
        lea     48(%rdi, %rax), %rax
        ret

        .p2align 4
L_return_null:
        xor     %eax, %eax              /* rax = NULL */
        ret

        .type   memchr_sse, @function
        .size   memchr_sse, .-memchr_sse

        /* Mark the object as not needing an executable stack. */
        .section .note.GNU-stack,"",@progbits

测试stub

stub.c

#include <stdio.h>#include <stdlib.h>#include <string.h>#include <unistd.h>#include <stdint.h>#include "common.h"extern void *memchr_sse(const void *s, int c, size_t n);int main(int argc, char **argv){char text[1024] = {0};void *result = NULL;uint64_t begin, end;memset(text, 'A', 1024);text[1022] = '\r';begin = get_cycle_count();//result = memchr_sse(text, '\r', 1024);result = memchr(text, '\r', 1024);end = get_cycle_count();if (result){printf("result @ %u cost %lu\n", result - (void *)text, end - begin);}return 0;}


编译

gcc -march=corei7 -O3 memchr_sse.s stub.c -o stub

测试平台:

Intel(R) Xeon(R) CPU E31230 @ 3.20GHz

memchr 测试结果

# ./stub
result @ 1022 cost 1404
# ./stub
result @ 1022 cost 1600
# ./stub
result @ 1022 cost 1452
# ./stub
result @ 1022 cost 1388
# ./stub
result @ 1022 cost 1440

memchr_sse 测试结果
# ./stub
result @ 1022 cost 524
# ./stub
result @ 1022 cost 568
# ./stub
result @ 1022 cost 572
# ./stub
result @ 1022 cost 612
# ./stub
result @ 1022 cost 524
# ./stub
result @ 1022 cost 520