/*
 * Selected GNU code snippets, part 19: a fast memcpy.
 *
 * Source: the Internet. Published under: "java怎么做空指针错误".
 * Editor: 程序博客网. Time: 2024/06/05 19:01.
 */

#include <stdio.h>

#include <stdlib.h>

/* for small memory blocks (<256 bytes) this version is faster */

/*
 * small_memcpy(to, from, n) -- copy n bytes from `from` to `to`.
 *
 * `to` and `from` must be modifiable pointer lvalues: both are left pointing
 * just past the bytes that were copied, and callers in this file rely on that
 * side effect.  Each argument may be evaluated more than once, so do not pass
 * expressions with side effects.
 *
 * On x86 the copy is a single "rep; movsb": the destination, source and count
 * are tied to the DI, SI and CX registers, and the updated DI/SI values are
 * written back into `to` and `from`.  Other targets get an equivalent byte
 * loop.  The body is wrapped in do { ... } while (0) so the macro behaves as
 * one statement, e.g. in front of an `else`.
 */
#if defined(__i386__) || defined(__x86_64__)
#define small_memcpy(to, from, n)                                          \
    do {                                                                   \
        size_t small_memcpy_count_; /* receives the final CX (unused) */   \
        __asm__ __volatile__(                                              \
            "rep; movsb"                                                   \
            : "=&D"(to), "=&S"(from), "=&c"(small_memcpy_count_)           \
            : "0"(to), "1"(from), "2"((size_t)(n))                         \
            : "memory");                                                   \
    } while (0)
#else
#define small_memcpy(to, from, n)                                          \
    do {                                                                   \
        unsigned char *small_memcpy_dst_ = (unsigned char *)(to);          \
        const unsigned char *small_memcpy_src_ =                           \
            (const unsigned char *)(from);                                 \
        size_t small_memcpy_count_ = (size_t)(n);                          \
        while (small_memcpy_count_-- > 0)                                  \
            *small_memcpy_dst_++ = *small_memcpy_src_++;                   \
        (to) = (void *)small_memcpy_dst_;                                  \
        (from) = (void *)small_memcpy_src_;                                \
    } while (0)
#endif

/* linux kernel __memcpy (from: /include/asm/string.h) */

/*
 * __memcpy -- copy n bytes from `from` to `to` and return the original `to`.
 *
 * For n >= 4 on x86 the bulk is moved as n/4 32-bit words with "rep ; movsl";
 * bits 1 and 0 of n then select an extra 16-bit ("movsw") and/or 8-bit
 * ("movsb") move for the remaining 0..3 bytes.  Shorter copies, and all
 * copies on other targets, use a plain byte loop.
 *
 * The regions must not overlap (same contract as memcpy).
 */
static inline void * __memcpy(void * to, const void * from, size_t n)
{
#if defined(__i386__) || defined(__x86_64__)
    if (n >= 4) {
        /*
         * Scratch outputs for the registers the string instructions modify.
         * They are pointer/size_t sized so the matching input constraints
         * have the same width on both 32-bit and 64-bit targets.  Because the
         * asm writes to these scratch variables, `to` itself is untouched.
         */
        size_t count_out;
        void *dst_out;
        const void *src_out;

        __asm__ __volatile__(
            "rep ; movsl\n\t"
            "testb $2,%b4\n\t"      /* bit 1 of n set: two more bytes */
            "je 1f\n\t"
            "movsw\n"
            "1:\ttestb $1,%b4\n\t"  /* bit 0 of n set: one more byte */
            "je 2f\n\t"
            "movsb\n"
            "2:"
            : "=&c" (count_out), "=&D" (dst_out), "=&S" (src_out)
            : "0" (n / 4), "q" (n), "1" (to), "2" (from)
            : "memory", "cc");
        return to;
    }
#endif
    {
        unsigned char *dst = to;
        const unsigned char *src = from;

        while (n-- > 0)
            *dst++ = *src++;
    }
    return to;
}

/*
 * Instruction selection for big_memcpy, driven by the HAVE_* feature macros.
 * Each macro expands to a string that is pasted into an inline-asm template.
 */

/* Instruction issued after the MMX copy loop so the FPU can be used again. */
#ifdef HAVE_3DNOW
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/*
 * Prefetch mnemonic.  Without MMX2 or 3DNow! there is no prefetch
 * instruction to use, so the placeholder starts an assembler comment ('#')
 * that swallows the operand written after it on the same line.
 */
#ifdef HAVE_MMX2
#define PREFETCH "prefetchnta"
#elif defined ( HAVE_3DNOW )
#define PREFETCH "prefetch"
#else
#define PREFETCH "# nop"
#endif

/* Store mnemonic: the non-temporal movntq with MMX2, a plain movq otherwise. */
#undef MOVNTQ
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

/* Minimum length, in bytes, at which big_memcpy takes its MMX path. */
#undef MIN_LEN
#ifdef HAVE_MMX1
#define MIN_LEN 0x800 /* 2K blocks */
#else
#define MIN_LEN 0x40 /* 64-byte blocks */
#endif

/*
 * big_memcpy -- copy len bytes from `from` to `to`; returns the original `to`.
 *
 * Copies of at least MIN_LEN bytes on x86 go through an MMX loop that moves
 * 64 bytes per iteration: the destination is first advanced to an 8-byte
 * boundary with small_memcpy, then whole 64-byte blocks are moved through
 * %mm0-%mm7.  Whatever is left (and every short copy) is finished with
 * small_memcpy.  The code relies on small_memcpy advancing `to` and `from`.
 *
 * The regions must not overlap.
 *
 * NOTE(review): the copy loop's asm does not list %mm0-%mm7 as clobbers; this
 * function keeps no MMX/floating-point values live across the loop.
 */
static void * big_memcpy(void *to, const void *from , size_t len) {
    void *retval = to;

#if defined(__i386__) || defined(__x86_64__)
    if (len >= MIN_LEN) {
        size_t delta;
        size_t i;

        /* Align the destination to an 8-byte (MMX register size) boundary. */
        delta = ((unsigned long int)to) & 7;
        if (delta) {
            delta = 8 - delta;
            len -= delta;
            small_memcpy(to, from, delta);
        }

        i = len >> 6; /* len/64 */
        len &= 63;

        /*
         * This algorithm is most effective when the code consecutively
         * reads and writes blocks that have the size of a cache line.
         * The size of a cache line is processor-dependent.
         * It will, however, be a minimum of 32 bytes on any processor.
         * It would be better to have the number of instructions that
         * perform reading and writing be a multiple of the number of
         * the processor's decoders, but that is not always possible.
         */
        for (; i > 0; i--) {
            __asm__ __volatile__ (
                PREFETCH " 320(%0)\n\t"
                "movq (%0), %%mm0\n\t"
                "movq 8(%0), %%mm1\n\t"
                "movq 16(%0), %%mm2\n\t"
                "movq 24(%0), %%mm3\n\t"
                "movq 32(%0), %%mm4\n\t"
                "movq 40(%0), %%mm5\n\t"
                "movq 48(%0), %%mm6\n\t"
                "movq 56(%0), %%mm7\n\t"
                MOVNTQ " %%mm0, (%1)\n\t"
                MOVNTQ " %%mm1, 8(%1)\n\t"
                MOVNTQ " %%mm2, 16(%1)\n\t"
                MOVNTQ " %%mm3, 24(%1)\n\t"
                MOVNTQ " %%mm4, 32(%1)\n\t"
                MOVNTQ " %%mm5, 40(%1)\n\t"
                MOVNTQ " %%mm6, 48(%1)\n\t"
                MOVNTQ " %%mm7, 56(%1)\n\t"
                :: "r" (from), "r" (to) : "memory");
            /* Step both pointers by one block; a cast is not an lvalue in C,
             * so the arithmetic is done on a typed copy of each pointer. */
            from = (const unsigned char *)from + 64;
            to = (unsigned char *)to + 64;
        }

#ifdef HAVE_MMX2
        /* since movntq is weakly-ordered, a "sfence"
         * is needed to become ordered again. */
        __asm__ __volatile__ ("sfence":::"memory");
#endif

        /* Leave MMX state so the FPU can be used again. */
        __asm__ __volatile__ (EMMS:::"memory");
    }
#endif

    /* Now do the tail of the block. */
    if (len)
        small_memcpy(to, from, len);

    return retval;
}

/* Test buffers: source pattern and copy destination, 1270 bytes each. */
char src[1270];
char dest[1270];

/*
 * Fill the first 1260 bytes of src with the sequence 1, 2, 3, ... (each value
 * converted to char).  The remaining bytes of src are not written here.
 */
void init_src()
{
    char *cursor = src;
    int value = 1;

    while (value <= 1260) {
        *cursor = value;
        ++cursor;
        ++value;
    }
}

/*
 * Print the elements of dest numbered beg..end (1-based, inclusive) as
 * decimal integers, each followed by a comma, then a single trailing space.
 */
void output(int beg, int end)
{
    int idx = beg - 1; /* convert the 1-based start to an array index */

    while (idx < end) {
        printf(" %d, ", dest[idx]);
        ++idx;
    }
    printf(" ");
}

/*
 * Copy the first 1100 bytes of src into dest with big_memcpy and print them.
 * (A 15-byte __memcpy(dest, src, 15) call was previously tried here.)
 */
void test_last16()
{
    const int copy_len = 1100;

    big_memcpy(dest, src, copy_len);
    output(1, copy_len);
}

/* Program entry: build the source pattern, then run the copy test. */
int main()
{
    init_src();      /* populate src */
    test_last16();   /* copy into dest and print the result */

    return 0;
}