SSE2 provides instructions that operate faster on aligned memory. By copying the first and last bytes of an unaligned memory destination with the conventional unaligned instructions, and copying everything in between with aligned instructions, it is possible to get this performance improvement on large unaligned memory blocks as well.
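For illustration, a minimal C sketch of the same idea using SSE2 intrinsics could look like the following; the function name, the 16-byte step, and the plain memcpy fallbacks are my own choices for this sketch, not taken from the assembly version further below.

#include <emmintrin.h> /* SSE2 intrinsics */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Sketch only: copy the head unaligned until the destination is 16-byte
 * aligned, copy the middle with aligned stores, copy the tail unaligned. */
void *memcpy_align_sketch(void *dst, const void *src, size_t n)
{
    unsigned char *d = (unsigned char *)dst;
    const unsigned char *s = (const unsigned char *)src;
    size_t head;

    if (n < 64)
        return memcpy(dst, src, n); /* small copies: not worth the setup */

    /* head: copy bytes until the destination is 16-byte aligned */
    head = (16 - ((uintptr_t)d & 15)) & 15;
    memcpy(d, s, head);
    d += head; s += head; n -= head;

    /* middle: aligned stores; the source may still be unaligned,
     * so keep unaligned loads */
    while (n >= 16) {
        __m128i x = _mm_loadu_si128((const __m128i *)s);
        _mm_store_si128((__m128i *)d, x);
        d += 16; s += 16; n -= 16;
    }

    /* tail: whatever is left is copied unaligned */
    memcpy(d, s, n);
    return dst;
}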
In this graph the green lines are the conventional memcpy available in Microsoft Visual Studio 2008, the red lines are the SSE memcpy function available in Nevrax NeL, and the blue lines are the custom SSE2 function. The bright colored lines show the performance on aligned memory blocks, while the dark colored lines were measured on differently unaligned blocks of memory. The horizontal axis shows the size of the copied block, and the vertical axis shows the copy speed in MB/s.
As you can see, NeL's SSE memcpy performs very well on aligned memory, but gives horrible performance on unaligned memory, as it does not take the alignment of the memory blocks into account. The built-in memcpy function is the fastest of all at copying blocks below 128 bytes, but also reaches its speed limit there. The SSE2 memcpy needs larger sizes to reach its maximum performance, but peaks above NeL's aligned SSE memcpy even for unaligned memory blocks.
The code is available below; please ask before using it.
void *memcpy_kaetemi_sse2(void *dst, void *src, int nBytes)
{
    __asm
    {
        // ecx = end of the destination block,
        // edi = destination cursor, esi = source cursor
        mov ecx, nBytes
        mov edi, dst
        mov esi, src
        add ecx, edi

        // prefetch the first 128 bytes of the source
        prefetchnta [esi]
        prefetchnta [esi+32]
        prefetchnta [esi+64]
        prefetchnta [esi+96]

        // blocks of 512 bytes or more take the SSE2 path;
        // smaller blocks are copied byte by byte
        cmp nBytes, 512
        jge fast
    slow:
        mov bl, [esi]
        mov [edi], bl
        inc edi
        inc esi
        cmp ecx, edi
        jnz slow
        jmp end

    fast:
        // round the end of the destination down to a multiple of 128 bytes;
        // the remaining tail is handled at 'last'
        and ecx, 0xFFFFFF80

        // ebx = source address corresponding to that rounded destination end
        mov ebx, esi
        sub ebx, edi
        add ebx, ecx

        // if the destination is not 128-byte aligned,
        // copy the first 128 bytes with unaligned instructions
        mov eax, edi
        and edi, 0xFFFFFF80
        cmp eax, edi
        jne first
        jmp more

    first:
        // unaligned copy of the first 128 bytes (eax still holds dst)
        movdqu xmm0, [esi]
        movdqu xmm1, [esi+16]
        movdqu xmm2, [esi+32]
        movdqu xmm3, [esi+48]

        movdqu xmm4, [esi+64]
        movdqu xmm5, [esi+80]
        movdqu xmm6, [esi+96]
        movdqu xmm7, [esi+112]

        movdqu [eax], xmm0
        movdqu [eax+16], xmm1
        movdqu [eax+32], xmm2
        movdqu [eax+48], xmm3

        movdqu [eax+64], xmm4
        movdqu [eax+80], xmm5
        movdqu [eax+96], xmm6
        movdqu [eax+112], xmm7

        // edi = first 128-byte boundary of the destination
        add edi, 128

        // advance the source by the same number of bytes the destination skipped
        sub eax, edi
        sub esi, eax

        // anything left between here and the rounded end?
        cmp ecx, edi
        jnz more
        jmp last

    more:
        // if the source is now 128-byte aligned as well, aligned loads can be used
        mov eax, esi
        and eax, 0xFFFFFF80
        cmp eax, esi
        jne unaligned4k

    aligned4k:
        // main loop: prefetch a 4096-byte chunk, then stream it to the destination
        mov eax, esi
        add eax, 4096
        cmp eax, ebx
        jle aligned4kin
        cmp ecx, edi
        jne alignedlast
        jmp last

    aligned4kin:
        // prefetch the next 4096 bytes of the source, 128 bytes per iteration
        prefetchnta [esi]
        prefetchnta [esi+32]
        prefetchnta [esi+64]
        prefetchnta [esi+96]

        add esi, 128

        cmp eax, esi
        jne aligned4kin

        // rewind to the start of the prefetched chunk
        sub esi, 4096

    aligned4kout:
        // copy the chunk, 128 bytes per iteration,
        // with aligned loads and non-temporal stores
        movdqa xmm0, [esi]
        movdqa xmm1, [esi+16]
        movdqa xmm2, [esi+32]
        movdqa xmm3, [esi+48]

        movdqa xmm4, [esi+64]
        movdqa xmm5, [esi+80]
        movdqa xmm6, [esi+96]
        movdqa xmm7, [esi+112]

        movntdq [edi], xmm0
        movntdq [edi+16], xmm1
        movntdq [edi+32], xmm2
        movntdq [edi+48], xmm3

        movntdq [edi+64], xmm4
        movntdq [edi+80], xmm5
        movntdq [edi+96], xmm6
        movntdq [edi+112], xmm7

        add esi, 128
        add edi, 128

        cmp eax, esi
        jne aligned4kout
        jmp aligned4k

    alignedlast:
        // fewer than 4096 bytes of whole 128-byte blocks remain:
        // prefetch all of them, then copy them
        mov eax, esi

    alignedlastin:
        prefetchnta [esi]
        prefetchnta [esi+32]
        prefetchnta [esi+64]
        prefetchnta [esi+96]

        add esi, 128

        cmp ebx, esi
        jne alignedlastin

        mov esi, eax

    alignedlastout:
        movdqa xmm0, [esi]
        movdqa xmm1, [esi+16]
        movdqa xmm2, [esi+32]
        movdqa xmm3, [esi+48]

        movdqa xmm4, [esi+64]
        movdqa xmm5, [esi+80]
        movdqa xmm6, [esi+96]
        movdqa xmm7, [esi+112]

        movntdq [edi], xmm0
        movntdq [edi+16], xmm1
        movntdq [edi+32], xmm2
        movntdq [edi+48], xmm3

        movntdq [edi+64], xmm4
        movntdq [edi+80], xmm5
        movntdq [edi+96], xmm6
        movntdq [edi+112], xmm7

        add esi, 128
        add edi, 128

        cmp ecx, edi
        jne alignedlastout
        jmp last

    unaligned4k:
        // same 4096-byte prefetch-and-copy loop, but with unaligned loads
        // because the source is not 128-byte aligned
        mov eax, esi
        add eax, 4096
        cmp eax, ebx
        jle unaligned4kin
        cmp ecx, edi
        jne unalignedlast
        jmp last

    unaligned4kin:
        prefetchnta [esi]
        prefetchnta [esi+32]
        prefetchnta [esi+64]
        prefetchnta [esi+96]

        add esi, 128

        cmp eax, esi
        jne unaligned4kin

        sub esi, 4096

    unaligned4kout:
        movdqu xmm0, [esi]
        movdqu xmm1, [esi+16]
        movdqu xmm2, [esi+32]
        movdqu xmm3, [esi+48]

        movdqu xmm4, [esi+64]
        movdqu xmm5, [esi+80]
        movdqu xmm6, [esi+96]
        movdqu xmm7, [esi+112]

        movntdq [edi], xmm0
        movntdq [edi+16], xmm1
        movntdq [edi+32], xmm2
        movntdq [edi+48], xmm3

        movntdq [edi+64], xmm4
        movntdq [edi+80], xmm5
        movntdq [edi+96], xmm6
        movntdq [edi+112], xmm7

        add esi, 128
        add edi, 128

        cmp eax, esi
        jne unaligned4kout
        jmp unaligned4k

    unalignedlast:
        // remaining whole 128-byte blocks, unaligned loads
        mov eax, esi

    unalignedlastin:
        prefetchnta [esi]
        prefetchnta [esi+32]
        prefetchnta [esi+64]
        prefetchnta [esi+96]

        add esi, 128

        cmp ebx, esi
        jne unalignedlastin

        mov esi, eax

    unalignedlastout:
        movdqu xmm0, [esi]
        movdqu xmm1, [esi+16]
        movdqu xmm2, [esi+32]
        movdqu xmm3, [esi+48]

        movdqu xmm4, [esi+64]
        movdqu xmm5, [esi+80]
        movdqu xmm6, [esi+96]
        movdqu xmm7, [esi+112]

        movntdq [edi], xmm0
        movntdq [edi+16], xmm1
        movntdq [edi+32], xmm2
        movntdq [edi+48], xmm3

        movntdq [edi+64], xmm4
        movntdq [edi+80], xmm5
        movntdq [edi+96], xmm6
        movntdq [edi+112], xmm7

        add esi, 128
        add edi, 128

        cmp ecx, edi
        jne unalignedlastout
        jmp last

    last:
        // copy the final 128 bytes up to dst + nBytes with unaligned
        // loads and stores; this may re-copy a few bytes, which is harmless
        mov ecx, nBytes
        mov edi, dst
        mov esi, src
        add edi, ecx
        add esi, ecx
        sub edi, 128
        sub esi, 128

        movdqu xmm0, [esi]
        movdqu xmm1, [esi+16]
        movdqu xmm2, [esi+32]
        movdqu xmm3, [esi+48]

        movdqu xmm4, [esi+64]
        movdqu xmm5, [esi+80]
        movdqu xmm6, [esi+96]
        movdqu xmm7, [esi+112]

        movdqu [edi], xmm0
        movdqu [edi+16], xmm1
        movdqu [edi+32], xmm2
        movdqu [edi+48], xmm3

        movdqu [edi+64], xmm4
        movdqu [edi+80], xmm5
        movdqu [edi+96], xmm6
        movdqu [edi+112], xmm7

    end:
    }
    return dst;
}
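A quick usage sketch, purely illustrative: the buffer size and the deliberate misalignment are my own choices, and the function only builds as 32-bit code with the Microsoft compiler because of the __asm block.

#include <assert.h>
#include <stdlib.h>
#include <string.h>

void *memcpy_kaetemi_sse2(void *dst, void *src, int nBytes); /* defined above */

int main(void)
{
    enum { N = 1 << 20 };
    char *src = (char *)malloc(N + 16);
    char *dst = (char *)malloc(N + 16);
    int i;

    for (i = 0; i < N; ++i)
        src[i + 3] = (char)i;

    /* deliberately misaligned source and destination pointers */
    memcpy_kaetemi_sse2(dst + 1, src + 3, N);
    assert(memcmp(dst + 1, src + 3, N) == 0);

    free(src);
    free(dst);
    return 0;
}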