SSE2 memcpy


SSE2 provides instructions that copy aligned memory faster than the conventional approach. By copying the first and last bytes of an unaligned destination with the conventional unaligned instructions, and copying everything in between as aligned, this performance improvement can be used on large unaligned memory blocks as well.
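
To illustrate the idea before diving into the assembly, here is a minimal sketch of the same head/middle/tail approach written with SSE2 intrinsics. The function name, the 16-byte granularity and the bytewise head and tail handling are simplifications for illustration only; the assembly below instead works in 128-byte blocks, uses non-temporal stores, and copies overlapping unaligned head and tail chunks.

#include <emmintrin.h>
#include <cstdint>
#include <cstring>

void *memcpy_sse2_sketch(void *dst, const void *src, size_t n)
{
    char *d = static_cast<char *>(dst);
    const char *s = static_cast<const char *>(src);
    if (n < 64)
        return memcpy(dst, src, n); // small copies: plain memcpy

    // Head: copy unaligned bytes until dst is 16-byte aligned.
    size_t head = (16 - (reinterpret_cast<uintptr_t>(d) & 15)) & 15;
    memcpy(d, s, head);
    d += head; s += head; n -= head;

    // Middle: aligned 16-byte stores; the loads stay unaligned because
    // only the destination alignment is guaranteed here.
    while (n >= 16)
    {
        __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i *>(s));
        _mm_store_si128(reinterpret_cast<__m128i *>(d), v);
        d += 16; s += 16; n -= 16;
    }

    // Tail: whatever is left, copied bytewise.
    memcpy(d, s, n);
    return dst;
}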

In the benchmark graph, the green lines are the conventional memcpy shipped with Microsoft Visual Studio 2008, the red lines are the SSE memcpy function available in Nevrax NeL, and the blue lines are the custom SSE2 function. The bright lines show performance on aligned memory blocks, while the dark lines are measured on blocks with various misalignments. The horizontal axis varies the size of the copied block; the vertical axis shows the copy speed in MB/s.

As you can see, NeL's SSE memcpy performs very well on aligned memory but gives poor performance on unaligned memory, as it does not take the alignment of the memory blocks into account. The built-in memcpy is the fastest at copying blocks below 128 bytes, but it also reaches its speed limit there. The SSE2 memcpy needs larger sizes to reach its maximum throughput, but it peaks above NeL's aligned SSE memcpy even for unaligned memory blocks.
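
For reference, copy speeds like the ones in the graph can be measured with a small harness along these lines. It is only a sketch under assumed conditions: it must be built as 32-bit MSVC code so that the inline-assembly routine below compiles, the buffer sizes and the amount of data copied per measurement are arbitrary, and the misalignment offsets are applied relative to whatever alignment the allocator happens to return.

#include <cstdio>
#include <cstring>
#include <ctime>
#include <vector>

void *memcpy_kaetemi_sse2(void *dst, void *src, int nBytes);

// Measure the throughput of the copy routine in MB/s (rough, clock()-based timing).
static double copyRateMBs(void *dst, void *src, int nBytes, int iterations)
{
    clock_t t0 = clock();
    for (int i = 0; i < iterations; ++i)
        memcpy_kaetemi_sse2(dst, src, nBytes);
    clock_t t1 = clock();
    double seconds = double(t1 - t0) / CLOCKS_PER_SEC;
    return (double(nBytes) * iterations) / (seconds * 1024.0 * 1024.0);
}

int main()
{
    std::vector<char> src(4 << 20), dst(4 << 20);   // 4 MB working buffers
    const int offsets[] = { 0, 1, 7 };              // aligned and misaligned starts
    for (int o = 0; o < 3; ++o)
    {
        for (int size = 128; size <= (1 << 20); size *= 2)
        {
            int iterations = (512 << 20) / size;    // copy ~512 MB per measurement
            double rate = copyRateMBs(&dst[0] + offsets[o], &src[0] + offsets[o],
                                      size, iterations);
            std::printf("offset %d, %8d bytes: %.1f MB/s\n", offsets[o], size, rate);
        }
        std::printf("\n");
    }
    return 0;
}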

The code is available below; ask before using.

void *memcpy_kaetemi_sse2(void *dst, void *src, int nBytes)
{
        __asm
        {
                // Copyright (C) 2009  Jan Boon (Kaetemi)
                // optimized on Intel Core 2 Duo T7500

                mov         ecx, nBytes
                mov         edi, dst
                mov         esi, src
                add         ecx, edi

                // nothing to copy for sizes of zero or below
                cmp         nBytes, 0
                jle         end

                prefetchnta [esi]
                prefetchnta [esi+32]
                prefetchnta [esi+64]
                prefetchnta [esi+96]

                // copy blocks smaller than 512 bytes one byte at a time
                cmp         nBytes, 512
                jge         fast
slow:
                mov         bl, [esi]
                mov         [edi], bl
                inc         edi
                inc         esi
                cmp         ecx, edi
                jnz         slow
                jmp         end

fast:
                // align dstEnd to 128 bytes
                and         ecx, 0xFFFFFF80

                // get srcEnd aligned to dstEnd aligned to 128 bytes
                mov         ebx, esi
                sub         ebx, edi
                add         ebx, ecx

                // skip unaligned copy if dst is aligned
                mov         eax, edi
                and         edi, 0xFFFFFF80
                cmp         eax, edi
                jne         first
                jmp         more

first:
                // copy the first 128 bytes unaligned
                movdqu      xmm0, [esi]
                movdqu      xmm1, [esi+16]
                movdqu      xmm2, [esi+32]
                movdqu      xmm3, [esi+48]

                movdqu      xmm4, [esi+64]
                movdqu      xmm5, [esi+80]
                movdqu      xmm6, [esi+96]
                movdqu      xmm7, [esi+112]

                movdqu      [eax], xmm0
                movdqu      [eax+16], xmm1
                movdqu      [eax+32], xmm2
                movdqu      [eax+48], xmm3

                movdqu      [eax+64], xmm4
                movdqu      [eax+80], xmm5
                movdqu      [eax+96], xmm6
                movdqu      [eax+112], xmm7

                // add 128 bytes to edi aligned earlier
                add         edi, 128

                // offset esi by the same value
                sub         eax, edi
                sub         esi, eax

                // last bytes if dst at dstEnd
                cmp         ecx, edi
                jnz         more
                jmp         last

more:
                // dst is now 128-byte aligned; check whether src is as well
                mov         eax, esi
                and         eax, 0xFFFFFF80
                cmp         eax, esi
                jne         unaligned4k

aligned4k:
                // copy the source in 4 KB blocks: prefetch a block, then
                // stream it to dst with aligned loads and non-temporal stores
                mov         eax, esi
                add         eax, 4096
                cmp         eax, ebx
                jle         aligned4kin
                cmp         ecx, edi
                jne         alignedlast
                jmp         last

aligned4kin:
                // prefetch the next 4 KB of the source
                prefetchnta [esi]
                prefetchnta [esi+32]
                prefetchnta [esi+64]
                prefetchnta [esi+96]

                add         esi, 128

                cmp         eax, esi
                jne         aligned4kin

                sub         esi, 4096

aligned4kout:
                // copy the prefetched 4 KB in 128-byte chunks
                movdqa      xmm0, [esi]
                movdqa      xmm1, [esi+16]
                movdqa      xmm2, [esi+32]
                movdqa      xmm3, [esi+48]

                movdqa      xmm4, [esi+64]
                movdqa      xmm5, [esi+80]
                movdqa      xmm6, [esi+96]
                movdqa      xmm7, [esi+112]

                movntdq     [edi], xmm0
                movntdq     [edi+16], xmm1
                movntdq     [edi+32], xmm2
                movntdq     [edi+48], xmm3

                movntdq     [edi+64], xmm4
                movntdq     [edi+80], xmm5
                movntdq     [edi+96], xmm6
                movntdq     [edi+112], xmm7

                add         esi, 128
                add         edi, 128

                cmp         eax, esi
                jne         aligned4kout
                jmp         aligned4k

alignedlast:
                // less than 4 KB left: prefetch it, then copy it in 128-byte chunks
                mov         eax, esi

alignedlastin:
                prefetchnta [esi]
                prefetchnta [esi+32]
                prefetchnta [esi+64]
                prefetchnta [esi+96]

                add         esi, 128

                cmp         ebx, esi
                jne         alignedlastin

                mov         esi, eax

alignedlastout:
                movdqa      xmm0, [esi]
                movdqa      xmm1, [esi+16]
                movdqa      xmm2, [esi+32]
                movdqa      xmm3, [esi+48]

                movdqa      xmm4, [esi+64]
                movdqa      xmm5, [esi+80]
                movdqa      xmm6, [esi+96]
                movdqa      xmm7, [esi+112]

                movntdq     [edi], xmm0
                movntdq     [edi+16], xmm1
                movntdq     [edi+32], xmm2
                movntdq     [edi+48], xmm3

                movntdq     [edi+64], xmm4
                movntdq     [edi+80], xmm5
                movntdq     [edi+96], xmm6
                movntdq     [edi+112], xmm7

                add         esi, 128
                add         edi, 128

                cmp         ecx, edi
                jne         alignedlastout
                jmp         last

unaligned4k:
                // same 4 KB blocking, but with unaligned loads from src
                mov         eax, esi
                add         eax, 4096
                cmp         eax, ebx
                jle         unaligned4kin
                cmp         ecx, edi
                jne         unalignedlast
                jmp         last

unaligned4kin:
                // prefetch the next 4 KB of the source
                prefetchnta [esi]
                prefetchnta [esi+32]
                prefetchnta [esi+64]
                prefetchnta [esi+96]

                add         esi, 128

                cmp         eax, esi
                jne         unaligned4kin

                sub         esi, 4096

unaligned4kout:
                // copy the prefetched 4 KB in 128-byte chunks
                movdqu      xmm0, [esi]
                movdqu      xmm1, [esi+16]
                movdqu      xmm2, [esi+32]
                movdqu      xmm3, [esi+48]

                movdqu      xmm4, [esi+64]
                movdqu      xmm5, [esi+80]
                movdqu      xmm6, [esi+96]
                movdqu      xmm7, [esi+112]

                movntdq     [edi], xmm0
                movntdq     [edi+16], xmm1
                movntdq     [edi+32], xmm2
                movntdq     [edi+48], xmm3

                movntdq     [edi+64], xmm4
                movntdq     [edi+80], xmm5
                movntdq     [edi+96], xmm6
                movntdq     [edi+112], xmm7

                add         esi, 128
                add         edi, 128

                cmp         eax, esi
                jne         unaligned4kout
                jmp         unaligned4k

unalignedlast:
                // less than 4 KB left: prefetch it, then copy it in 128-byte chunks
                mov         eax, esi

unalignedlastin:
                prefetchnta [esi]
                prefetchnta [esi+32]
                prefetchnta [esi+64]
                prefetchnta [esi+96]

                add         esi, 128

                cmp         ebx, esi
                jne         unalignedlastin

                mov         esi, eax

unalignedlastout:
                movdqu      xmm0, [esi]
                movdqu      xmm1, [esi+16]
                movdqu      xmm2, [esi+32]
                movdqu      xmm3, [esi+48]

                movdqu      xmm4, [esi+64]
                movdqu      xmm5, [esi+80]
                movdqu      xmm6, [esi+96]
                movdqu      xmm7, [esi+112]

                movntdq     [edi], xmm0
                movntdq     [edi+16], xmm1
                movntdq     [edi+32], xmm2
                movntdq     [edi+48], xmm3

                movntdq     [edi+64], xmm4
                movntdq     [edi+80], xmm5
                movntdq     [edi+96], xmm6
                movntdq     [edi+112], xmm7

                add         esi, 128
                add         edi, 128

                cmp         ecx, edi
                jne         unalignedlastout
                jmp         last

last:
                // get the last 128 bytes
                mov         ecx, nBytes
                mov         edi, dst
                mov         esi, src
                add         edi, ecx
                add         esi, ecx
                sub         edi, 128
                sub         esi, 128

                // copy the last 128 bytes unaligned
                movdqu      xmm0, [esi]
                movdqu      xmm1, [esi+16]
                movdqu      xmm2, [esi+32]
                movdqu      xmm3, [esi+48]

                movdqu      xmm4, [esi+64]
                movdqu      xmm5, [esi+80]
                movdqu      xmm6, [esi+96]
                movdqu      xmm7, [esi+112]

                movdqu      [edi], xmm0
                movdqu      [edi+16], xmm1
                movdqu      [edi+32], xmm2
                movdqu      [edi+48], xmm3

                movdqu      [edi+64], xmm4
                movdqu      [edi+80], xmm5
                movdqu      [edi+96], xmm6
                movdqu      [edi+112], xmm7

end:
        }
        return dst;
}
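
A few usage caveats: the routine is 32-bit MSVC inline assembly, so it will not build for x64 targets, and the bulk paths use non-temporal stores (movntdq), so issuing an sfence before handing the copied data to another thread may be advisable. If SSE2 availability is not guaranteed, a wrapper along the lines of the sketch below can fall back to the CRT memcpy at run time; the name memcpy_auto and the fallback policy are illustrative, not part of the original code. CPUID function 1 reports SSE2 support in EDX bit 26.

#include <intrin.h>
#include <cstring>

void *memcpy_kaetemi_sse2(void *dst, void *src, int nBytes);

// Check the SSE2 feature bit (CPUID function 1, EDX bit 26).
static bool cpuHasSSE2()
{
    int info[4];
    __cpuid(info, 1);
    return (info[3] & (1 << 26)) != 0;
}

// Illustrative wrapper: use the SSE2 routine when available, else the CRT memcpy.
void *memcpy_auto(void *dst, void *src, int nBytes)
{
    static const bool hasSSE2 = cpuHasSSE2();
    if (hasSSE2)
        return memcpy_kaetemi_sse2(dst, src, nBytes);
    return std::memcpy(dst, src, (size_t)nBytes);
}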
