部分GNU代码片 19、快速的memcpy
来源:互联网 发布:java怎么做空指针错误 编辑:程序博客网 时间:2024/06/05 19:01
#include <stdio.h>
#include <stdlib.h>
/*
 * small_memcpy(to, from, n) - copy n bytes from `from` to `to`, byte by byte.
 *
 * Original author's note: for small memory blocks (<256 bytes) this version
 * is faster.
 *
 * This is a statement macro, not a function:
 *   - `to` and `from` must be modifiable pointer lvalues.  Both are updated
 *     and end up pointing one past the last byte copied, so a caller can
 *     continue a copy from where the macro stopped.
 *   - `n` is evaluated exactly once and converted to size_t.
 *   - The body is wrapped in do { } while (0) so that
 *     `if (c) small_memcpy(a, b, n); else ...` parses as one statement.
 *
 * On x86 the copy is a single "rep; movsb" (count in the C register,
 * destination in DI, source in SI).  On any other target an equivalent byte
 * loop is used, with the same effect on `to` and `from`.
 */
#if defined(__i386__) || defined(__x86_64__)
#define small_memcpy(to, from, n)                                         \
    do {                                                                  \
        /* Receives the exhausted count register; value is unused. */     \
        size_t small_memcpy_count_;                                       \
        __asm__ __volatile__(                                             \
            "rep; movsb"                                                  \
            : "=&D" (to), "=&S" (from), "=&c" (small_memcpy_count_)       \
            : "0" (to), "1" (from), "2" ((size_t)(n))                     \
            : "memory");                                                  \
    } while (0)
#else
#define small_memcpy(to, from, n)                                         \
    do {                                                                  \
        unsigned char *small_memcpy_dst_ = (unsigned char *)(to);         \
        const unsigned char *small_memcpy_src_ =                          \
            (const unsigned char *)(from);                                \
        size_t small_memcpy_count_ = (size_t)(n);                         \
        while (small_memcpy_count_ != 0) {                                \
            *small_memcpy_dst_++ = *small_memcpy_src_++;                  \
            small_memcpy_count_--;                                        \
        }                                                                 \
        /* Mirror the asm version: leave both pointers advanced. */       \
        (to) = (void *)small_memcpy_dst_;                                 \
        (from) = (const void *)small_memcpy_src_;                         \
    } while (0)
#endif
/*
 * __memcpy - copy n bytes from `from` to `to` and return `to`.
 *
 * Per the original note, this is the Linux kernel __memcpy
 * (from: /include/asm/string.h).
 *
 * x86 path: "rep movsl" copies n/4 four-byte words, then bit 1 and bit 0 of
 * n select one extra "movsw" (2 bytes) and one extra "movsb" (1 byte) for
 * the tail.  With n < 4 the word count is zero, so the tail instructions
 * alone copy the 0..3 bytes; no separate small-size branch is needed.
 * (The earlier small-size branch went through small_memcpy, which advances
 * its pointer arguments, so the function returned to + n instead of to.)
 *
 * The three asm outputs only tell the compiler that the count, destination
 * and source registers are modified; they are pointer-width so the inputs
 * tied to them are not truncated on 64-bit targets.
 *
 * Other targets: plain byte loop.
 *
 * The regions must not overlap.
 */
static inline void *__memcpy(void *to, const void *from, size_t n)
{
#if defined(__i386__) || defined(__x86_64__)
    size_t words_left;      /* count register after the copy (unused) */
    void *dst_end;          /* destination register after the copy (unused) */
    const void *src_end;    /* source register after the copy (unused) */

    __asm__ __volatile__(
        "rep ; movsl\n\t"
        "testb $2,%b4\n\t"
        "je 1f\n\t"
        "movsw\n"
        "1:\ttestb $1,%b4\n\t"
        "je 2f\n\t"
        "movsb\n"
        "2:"
        : "=&c" (words_left), "=&D" (dst_end), "=&S" (src_end)
        : "0" (n / 4), "q" (n), "1" (to), "2" (from)
        : "memory", "cc");
#else
    unsigned char *dst = to;
    const unsigned char *src = from;
    size_t left;

    for (left = n; left != 0; left--)
        *dst++ = *src++;
#endif
    return to;
}
/*
 * Instruction mnemonics and thresholds used by big_memcpy, selected by the
 * HAVE_3DNOW / HAVE_MMX2 / HAVE_MMX1 build macros.  Each name is #undef'd
 * first so this section can follow an earlier definition without a
 * redefinition warning.
 */

/* EMMS: issued after the MMX copy loop so the FPU can be used again. */
#undef EMMS
#ifdef HAVE_3DNOW
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/*
 * PREFETCH: mnemonic pasted in front of a source-address operand.
 * When no prefetch instruction is configured the string must turn the rest
 * of that assembler line into a comment.  '#' is used for that: GNU as only
 * treats '/' as a comment character when --divide is not in effect, and
 * other assemblers may not accept it at all.
 */
#undef PREFETCH
#ifdef HAVE_MMX2
#define PREFETCH "prefetchnta"
#elif defined ( HAVE_3DNOW )
#define PREFETCH "prefetch"
#else
#define PREFETCH "# nop"
#endif

/*
 * MOVNTQ: store instruction for the destination.  With HAVE_MMX2 it is
 * "movntq", which is weakly ordered and is followed by an "sfence" in
 * big_memcpy; otherwise an ordinary "movq".
 */
#undef MOVNTQ
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

/* MIN_LEN: smallest length for which big_memcpy takes its MMX block loop. */
#undef MIN_LEN
#ifdef HAVE_MMX1
#define MIN_LEN 0x800 /* 2K blocks */
#else
#define MIN_LEN 0x40 /* 64-byte blocks */
#endif
static void * big_memcpy(void *to, const void *from , size_t len) ...{
void *retval;
size_t i;
retval = to;
if(len >= MIN_LEN)
...{
register unsigned long int delta;
/**//* Align destinition to MMREG_SIZE -boundary */
delta = ((unsigned long int)to)&7;
if(delta)
...{
delta=8-delta;
len -= delta;
small_memcpy(to, from, delta);
}
i = len >> 6; /**//* len/64 */
len &= 63;
/**//*
This algorithm is top effective when the code consequently
reads and writes blocks which have size of cache line.
Size of cache line is processor-dependent.
It will, however, be a minimum of 32 bytes on any processors.
It would be better to have a number of instructions which
perform reading and writing to be multiple to a number of
processor's decoders, but it's not always possible.
*/
for(; i>0; i--)
...{
__asm__ __volatile__ (
PREFETCH" 320(%0) "
"movq (%0), %%mm0 "
"movq 8(%0), %%mm1 "
"movq 16(%0), %%mm2 "
"movq 24(%0), %%mm3 "
"movq 32(%0), %%mm4 "
"movq 40(%0), %%mm5 "
"movq 48(%0), %%mm6 "
"movq 56(%0), %%mm7 "
MOVNTQ" %%mm0, (%1) "
MOVNTQ" %%mm1, 8(%1) "
MOVNTQ" %%mm2, 16(%1) "
MOVNTQ" %%mm3, 24(%1) "
MOVNTQ" %%mm4, 32(%1) "
MOVNTQ" %%mm5, 40(%1) "
MOVNTQ" %%mm6, 48(%1) "
MOVNTQ" %%mm7, 56(%1) "
:: "r" (from), "r" (to) : "memory");
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
#ifdef HAVE_MMX2
/**//* since movntq is weakly-ordered, a "sfence"
* is needed to become ordered again. */
__asm__ __volatile__ ("sfence":::"memory");
#endif
/**//* enables to use FPU */
__asm__ __volatile__ (EMMS:::"memory");
}
/**//*
* Now do the tail of the block
*/
if(len) small_memcpy(to, from, len);
return retval;
}
/* Test buffers: big_memcpy copies from src into dest, output prints dest. */
char src[1270];
char dest[1270];

/*
 * init_src - fill src[0..1259] with the pattern src[i] = (char)(i + 1).
 *
 * Values above the range of char are narrowed by the explicit cast, so the
 * pattern repeats.  The last 10 bytes of src are not written and keep the
 * zero value that file-scope objects start with.
 */
void init_src(void)
{
    int i;

    for (i = 0; i < 1260; i++)
        src[i] = (char)(i + 1);
}
void output(int beg, int end)
...{
int i;
for(i = beg-1; i < end; i++)
printf(" %d, ", dest[i]);
printf(" ");
}
void test_last16()
...{
// __memcpy(dest, src, 15);
big_memcpy(dest, src, 1100);
output(1, 1100);
}
/* Entry point: fill the source buffer, then run the copy-and-print test. */
int main(void)
{
    init_src();
    test_last16();
    return 0;
}
#include <stdlib.h>
/*
 * small_memcpy(to, from, n) - copy n bytes from `from` to `to`, byte by byte.
 *
 * Original author's note: for small memory blocks (<256 bytes) this version
 * is faster.
 *
 * This is a statement macro, not a function:
 *   - `to` and `from` must be modifiable pointer lvalues.  Both are updated
 *     and end up pointing one past the last byte copied, so a caller can
 *     continue a copy from where the macro stopped.
 *   - `n` is evaluated exactly once and converted to size_t.
 *   - The body is wrapped in do { } while (0) so that
 *     `if (c) small_memcpy(a, b, n); else ...` parses as one statement.
 *
 * On x86 the copy is a single "rep; movsb" (count in the C register,
 * destination in DI, source in SI).  On any other target an equivalent byte
 * loop is used, with the same effect on `to` and `from`.
 */
#if defined(__i386__) || defined(__x86_64__)
#define small_memcpy(to, from, n)                                         \
    do {                                                                  \
        /* Receives the exhausted count register; value is unused. */     \
        size_t small_memcpy_count_;                                       \
        __asm__ __volatile__(                                             \
            "rep; movsb"                                                  \
            : "=&D" (to), "=&S" (from), "=&c" (small_memcpy_count_)       \
            : "0" (to), "1" (from), "2" ((size_t)(n))                     \
            : "memory");                                                  \
    } while (0)
#else
#define small_memcpy(to, from, n)                                         \
    do {                                                                  \
        unsigned char *small_memcpy_dst_ = (unsigned char *)(to);         \
        const unsigned char *small_memcpy_src_ =                          \
            (const unsigned char *)(from);                                \
        size_t small_memcpy_count_ = (size_t)(n);                         \
        while (small_memcpy_count_ != 0) {                                \
            *small_memcpy_dst_++ = *small_memcpy_src_++;                  \
            small_memcpy_count_--;                                        \
        }                                                                 \
        /* Mirror the asm version: leave both pointers advanced. */       \
        (to) = (void *)small_memcpy_dst_;                                 \
        (from) = (const void *)small_memcpy_src_;                         \
    } while (0)
#endif
/*
 * __memcpy - copy n bytes from `from` to `to` and return `to`.
 *
 * Per the original note, this is the Linux kernel __memcpy
 * (from: /include/asm/string.h).
 *
 * x86 path: "rep movsl" copies n/4 four-byte words, then bit 1 and bit 0 of
 * n select one extra "movsw" (2 bytes) and one extra "movsb" (1 byte) for
 * the tail.  With n < 4 the word count is zero, so the tail instructions
 * alone copy the 0..3 bytes; no separate small-size branch is needed.
 * (The earlier small-size branch went through small_memcpy, which advances
 * its pointer arguments, so the function returned to + n instead of to.)
 *
 * The three asm outputs only tell the compiler that the count, destination
 * and source registers are modified; they are pointer-width so the inputs
 * tied to them are not truncated on 64-bit targets.
 *
 * Other targets: plain byte loop.
 *
 * The regions must not overlap.
 */
static inline void *__memcpy(void *to, const void *from, size_t n)
{
#if defined(__i386__) || defined(__x86_64__)
    size_t words_left;      /* count register after the copy (unused) */
    void *dst_end;          /* destination register after the copy (unused) */
    const void *src_end;    /* source register after the copy (unused) */

    __asm__ __volatile__(
        "rep ; movsl\n\t"
        "testb $2,%b4\n\t"
        "je 1f\n\t"
        "movsw\n"
        "1:\ttestb $1,%b4\n\t"
        "je 2f\n\t"
        "movsb\n"
        "2:"
        : "=&c" (words_left), "=&D" (dst_end), "=&S" (src_end)
        : "0" (n / 4), "q" (n), "1" (to), "2" (from)
        : "memory", "cc");
#else
    unsigned char *dst = to;
    const unsigned char *src = from;
    size_t left;

    for (left = n; left != 0; left--)
        *dst++ = *src++;
#endif
    return to;
}
/*
 * Instruction mnemonics and thresholds used by big_memcpy, selected by the
 * HAVE_3DNOW / HAVE_MMX2 / HAVE_MMX1 build macros.  Each name is #undef'd
 * first so this section can follow an earlier definition without a
 * redefinition warning.
 */

/* EMMS: issued after the MMX copy loop so the FPU can be used again. */
#undef EMMS
#ifdef HAVE_3DNOW
#define EMMS "femms"
#else
#define EMMS "emms"
#endif

/*
 * PREFETCH: mnemonic pasted in front of a source-address operand.
 * When no prefetch instruction is configured the string must turn the rest
 * of that assembler line into a comment.  '#' is used for that: GNU as only
 * treats '/' as a comment character when --divide is not in effect, and
 * other assemblers may not accept it at all.
 */
#undef PREFETCH
#ifdef HAVE_MMX2
#define PREFETCH "prefetchnta"
#elif defined ( HAVE_3DNOW )
#define PREFETCH "prefetch"
#else
#define PREFETCH "# nop"
#endif

/*
 * MOVNTQ: store instruction for the destination.  With HAVE_MMX2 it is
 * "movntq", which is weakly ordered and is followed by an "sfence" in
 * big_memcpy; otherwise an ordinary "movq".
 */
#undef MOVNTQ
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

/* MIN_LEN: smallest length for which big_memcpy takes its MMX block loop. */
#undef MIN_LEN
#ifdef HAVE_MMX1
#define MIN_LEN 0x800 /* 2K blocks */
#else
#define MIN_LEN 0x40 /* 64-byte blocks */
#endif
static void * big_memcpy(void *to, const void *from , size_t len) ...{
void *retval;
size_t i;
retval = to;
if(len >= MIN_LEN)
...{
register unsigned long int delta;
/**//* Align destinition to MMREG_SIZE -boundary */
delta = ((unsigned long int)to)&7;
if(delta)
...{
delta=8-delta;
len -= delta;
small_memcpy(to, from, delta);
}
i = len >> 6; /**//* len/64 */
len &= 63;
/**//*
This algorithm is top effective when the code consequently
reads and writes blocks which have size of cache line.
Size of cache line is processor-dependent.
It will, however, be a minimum of 32 bytes on any processors.
It would be better to have a number of instructions which
perform reading and writing to be multiple to a number of
processor's decoders, but it's not always possible.
*/
for(; i>0; i--)
...{
__asm__ __volatile__ (
PREFETCH" 320(%0) "
"movq (%0), %%mm0 "
"movq 8(%0), %%mm1 "
"movq 16(%0), %%mm2 "
"movq 24(%0), %%mm3 "
"movq 32(%0), %%mm4 "
"movq 40(%0), %%mm5 "
"movq 48(%0), %%mm6 "
"movq 56(%0), %%mm7 "
MOVNTQ" %%mm0, (%1) "
MOVNTQ" %%mm1, 8(%1) "
MOVNTQ" %%mm2, 16(%1) "
MOVNTQ" %%mm3, 24(%1) "
MOVNTQ" %%mm4, 32(%1) "
MOVNTQ" %%mm5, 40(%1) "
MOVNTQ" %%mm6, 48(%1) "
MOVNTQ" %%mm7, 56(%1) "
:: "r" (from), "r" (to) : "memory");
((const unsigned char *)from)+=64;
((unsigned char *)to)+=64;
}
#ifdef HAVE_MMX2
/**//* since movntq is weakly-ordered, a "sfence"
* is needed to become ordered again. */
__asm__ __volatile__ ("sfence":::"memory");
#endif
/**//* enables to use FPU */
__asm__ __volatile__ (EMMS:::"memory");
}
/**//*
* Now do the tail of the block
*/
if(len) small_memcpy(to, from, len);
return retval;
}
/* Test buffers: big_memcpy copies from src into dest, output prints dest. */
char src[1270];
char dest[1270];

/*
 * init_src - fill src[0..1259] with the pattern src[i] = (char)(i + 1).
 *
 * Values above the range of char are narrowed by the explicit cast, so the
 * pattern repeats.  The last 10 bytes of src are not written and keep the
 * zero value that file-scope objects start with.
 */
void init_src(void)
{
    int i;

    for (i = 0; i < 1260; i++)
        src[i] = (char)(i + 1);
}
void output(int beg, int end)
...{
int i;
for(i = beg-1; i < end; i++)
printf(" %d, ", dest[i]);
printf(" ");
}
void test_last16()
...{
// __memcpy(dest, src, 15);
big_memcpy(dest, src, 1100);
output(1, 1100);
}
/* Entry point: fill the source buffer, then run the copy-and-print test. */
int main(void)
{
    init_src();
    test_last16();
    return 0;
}
- 部分GNU代码片 19、快速的memcpy
- 部分GNU代码片 16、GDB调试多线程的方法
- 部分GNU代码片 17、GDB调试so的方法
- 部分GNU代码片 1、trim
- 部分GNU代码片 2、read_configfile
- 部分GNU代码片 6、split
- 部分GNU代码片 7、mysql接口
- 部分GNU代码片 11、合并文件
- 部分GNU代码片 18、字符串操作
- 部分GNU代码片 8、程序的配置文件解析部分辨别代码
- 部分GNU代码片 5、两个time_t之间的差(天、周、月)
- 部分GNU代码片 14、获取文件长度的两种方法 fopen open
- 部分GNU代码片 15、GDB调试多进程的方法
- 部分GNU代码片 3、字符串2二维数组
- 部分GNU代码片 4、折半查找算法
- 部分GNU代码片 9、当前时间 精确到us
- 部分GNU代码片 10、二叉堆(Binary Heap)
- 部分GNU代码片 12、给出年月日求天数
- 【转自TICORE】【HOW TO 系列】讓 Flex 內不可選擇的文字超連結生效
- ViewPdf function
- Java编程那些事儿16——代码框架、关键字和标识符
- Skype推出不限时国际长话服务
- 野餐
- 部分GNU代码片 19、快速的memcpy
- 转换运算符 implicit,explicit,operator
- 【转自KINGNARE】在自己的组件中应用PhotoShop Express的黑色皮肤
- 权证操作技巧1
- SkyEye安装与使用
- 权证操作技巧2
- 山腰中的AMD,向前是唯一的选择
- 相册发图片太慢
- Spring整合Struts