/*
   (c) Copyright 2000-2002  convergence integrated media GmbH.
   (c) Copyright 2002       convergence GmbH.

   All rights reserved.

   Written by Denis Oliver Kropp <dok@directfb.org>,
              Andreas Hundt <andi@fischlustig.de> and
              Sven Neumann <sven@convergence.de>.
   Silvano Galliani aka kysucix <kysucix@dyne.org> sse2 version

   Fast memcpy code was taken from xine (see below).

   This library is free software; you can redistribute it and/or
   modify it under the terms of the GNU Lesser General Public
   License as published by the Free Software Foundation; either
   version 2 of the License, or (at your option) any later version.

   This library is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
   Lesser General Public License for more details.

   You should have received a copy of the GNU Lesser General Public
   License along with this library; if not, write to the
   Free Software Foundation, Inc., 59 Temple Place - Suite 330,
   Boston, MA 02111-1307, USA.
*/

/*
 * Copyright (C) 2001 the xine project
 *
 * This file is part of xine, a unix video player.
 *
 * xine is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * xine is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA
 *
 * These are the MMX/MMX2/SSE optimized versions of memcpy
 *
 * This code was adapted from Linux Kernel sources by Nick Kurshev to
 * the mplayer program. (http://mplayer.sourceforge.net)
 *
 * Miguel Freitas split the #ifdefs into several specialized functions
 * that are benchmarked at runtime by xine.  Some original comments from
 * Nick have been preserved, documenting some MMX/SSE oddities.
 * He also added a kernel memcpy function that seems faster than the
 * glibc one.
 */

/* Original comments from mplayer (file: aclib.c):

   I took this part of the code from Linux-2.4.3 and slightly modified
   it for the MMX, MMX2 and SSE instruction sets.  I did so because
   Linux copies page-aligned blocks, while mplayer works on weakly
   ordered data, which the original sources cannot speed up.  Only
   PREFETCHNTA and MOVNTQ used together have any effect!

   From the IA-32 Intel Architecture Software Developer's Manual
   Volume 1, Order Number 245470:
   "10.4.6. Cacheability Control, Prefetch, and Memory Ordering
   Instructions"

   Data referenced by a program can be temporal (data will be used
   again) or non-temporal (data will be referenced once and not reused
   in the immediate future).  To make efficient use of the processor's
   caches, it is generally desirable to cache temporal data and not
   cache non-temporal data.  Overloading the processor's caches with
   non-temporal data is sometimes referred to as "polluting the
   caches".  The non-temporal data is written to memory with
   Write-Combining semantics.

   The PREFETCHh instructions permit a program to load data into the
   processor at a suggested cache level, so that it is closer to the
   processor's load and store units when it is needed.  If the data is
   already present in a level of the cache hierarchy that is closer to
   the processor, the PREFETCHh instruction will not result in any data
   movement.  Here we should use PREFETCHNTA, which fetches non-temporal
   data into a location close to the processor, minimizing cache
   pollution.

   The MOVNTQ (store quadword using non-temporal hint) instruction
   stores packed integer data from an MMX register to memory, using a
   non-temporal hint.  The MOVNTPS (store packed single-precision
   floating-point values using non-temporal hint) instruction stores
   packed floating-point data from an XMM register to memory, using a
   non-temporal hint.

   The SFENCE (Store Fence) instruction controls write ordering by
   creating a fence for memory store operations.  This instruction
   guarantees that the results of every store instruction that precedes
   the store fence in program order are globally visible before any
   store instruction that follows the fence.  The SFENCE instruction
   provides an efficient way of ensuring ordering between procedures
   that produce weakly-ordered data and procedures that consume that
   data.

   If you have questions, please contact me: Nick Kurshev
   <nickols_k@mail.ru>.
*/
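
/* For illustration only: on toolchains that ship <emmintrin.h>, the
 * PREFETCHNTA / non-temporal-store / SFENCE pattern described above can
 * be written with compiler intrinsics instead of inline asm.  This is a
 * hypothetical sketch, not part of the original source; it assumes a
 * 16-byte-aligned source and destination and a length that is a
 * multiple of 16 bytes.
 */
#if 0
#include <emmintrin.h>

static void stream_copy_sketch(void *to, const void *from, size_t len)
{
    __m128i       *d = (__m128i *) to;
    const __m128i *s = (const __m128i *) from;
    size_t         i;

    for (i = 0; i < len / 16; i++) {
        /* prefetch 320 bytes (20 vectors) ahead, as the loops below do */
        _mm_prefetch((const char *) (s + i + 20), _MM_HINT_NTA);
        /* aligned load + movntdq streaming store */
        _mm_stream_si128(d + i, _mm_load_si128(s + i));
    }
    _mm_sfence();  /* make the weakly-ordered streaming stores globally visible */
}
#endif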
/* mmx v.1 note: since we added alignment of the destination, this
   speeds up memory copying on PentMMX, Celeron-1 and P2 by up to 12%
   versus the standard (non MMX-optimized) version.
   Note: on K6-2+ it speeds up memory copying by up to 25% and
   on K7 and P3 by about 500% (5 times).
*/

/* Additional notes on gcc assembly and processors: [MF]
   "prefetch" is specific to AMD processors; the Intel ones are
   prefetcht0, prefetcht1 and prefetcht2, which are not recognized by
   my gcc.  prefetchnta is supported on both the Athlon and the
   Pentium 3.

   Therefore I will take the prefetchnta instructions out of the mmx1
   version to avoid problems on the Pentium MMX and K6-2.

   Quote of the day:
   "Using prefetches efficiently is more of an art than a science"
*/

#include <sys/time.h>
#include <time.h>

#include <stdlib.h>
#include <string.h>

#include <config.h>
#include <jutils.h>
#include <cpu_accel.h>
#include <fastmemcpy.h>


#ifdef ARCH_X86

/* For small memory blocks (<256 bytes) this version is faster.
   Note: the asm outputs leave `to' and `from' advanced past the
   copied block. */
#define small_memcpy(to,from,n)\
{\
    register unsigned long int dummy;\
    __asm__ __volatile__(\
        "rep; movsb"\
        :"=&D"(to), "=&S"(from), "=&c"(dummy)\
        :"0" (to), "1" (from),"2" (n)\
        : "memory");\
}

/* On K6 femms is faster than emms.  On K7 femms is directly mapped
   onto emms. */
#ifdef HAVE_3DNOW
#define EMMS     "femms"
#else
#define EMMS     "emms"
#endif

#ifdef HAVE_MMX2
#define PREFETCH "prefetchnta"
#elif defined ( HAVE_3DNOW )
#define PREFETCH "prefetch"
#else
#define PREFETCH "/nop"
#endif


#undef MOVNTQ
#ifdef HAVE_MMX2
#define MOVNTQ "movntq"
#else
#define MOVNTQ "movq"
#endif

#undef MIN_LEN
#ifdef HAVE_MMX1
#define MIN_LEN 0x800  /* 2K blocks */
#else
#define MIN_LEN 0x40   /* 64-byte blocks */
#endif


static void * agp_memcpy(void *to, const void *from, size_t len) {
    void *retval;
    size_t i;
    retval = to;
    if(len >= MIN_LEN)
    {
        register unsigned long int delta;
        /* Align destination to an MMREG_SIZE boundary */
        delta = ((unsigned long int)to)&7;
        if(delta)
        {
            delta=8-delta;
            len -= delta;
            small_memcpy(to, from, delta);
        }
        i = len >> 6; /* len/64 */
        len &= 63;
        /*
           This algorithm is most effective when the code consecutively
           reads and writes blocks the size of a cache line.  The cache
           line size is processor-dependent, but it will be a minimum of
           32 bytes on any processor.  It would be better for the number
           of read/write instructions to be a multiple of the number of
           the processor's decoders, but that is not always possible.
        */
        for(; i>0; i--)
        {
            __asm__ __volatile__ (
                PREFETCH" 320(%0)\n"
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                MOVNTQ" %%mm0, (%1)\n"
                MOVNTQ" %%mm1, 8(%1)\n"
                MOVNTQ" %%mm2, 16(%1)\n"
                MOVNTQ" %%mm3, 24(%1)\n"
                MOVNTQ" %%mm4, 32(%1)\n"
                MOVNTQ" %%mm5, 40(%1)\n"
                MOVNTQ" %%mm6, 48(%1)\n"
                MOVNTQ" %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
            from = ((const unsigned char *)from)+64;
            to = ((unsigned char *)to)+64;
        }
#ifdef HAVE_MMX2
        /* since movntq is weakly-ordered, an "sfence"
         * is needed to become ordered again. */
        __asm__ __volatile__ ("sfence":::"memory");
#endif
        /* re-enable use of the FPU */
        __asm__ __volatile__ (EMMS:::"memory");
    }
    /*
     * Now do the tail of the block
     */
    if(len) small_memcpy(to, from, len);
    return retval;
}


/* linux kernel __memcpy (from: /include/asm/string.h) */
static inline void * __memcpy(void * to, const void * from, size_t n)
{
    int d0, d1, d2;

    if ( n < 4 ) {
        small_memcpy(to,from,n);
    }
    else
        __asm__ __volatile__(
            "rep ; movsl\n\t"
            "testb $2,%b4\n\t"
            "je 1f\n\t"
            "movsw\n"
            "1:\ttestb $1,%b4\n\t"
            "je 2f\n\t"
            "movsb\n"
            "2:"
            : "=&c" (d0), "=&D" (d1), "=&S" (d2)
            : "0" (n/4), "q" (n), "1" ((long) to), "2" ((long) from)
            : "memory");

    return(to);
}
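
/* For reference, a hypothetical plain-C rendering of the logic above
 * (illustration only, not part of the original source): "rep ; movsl"
 * copies n/4 32-bit words, then bit 1 and bit 0 of n select the
 * trailing "movsw" and "movsb".
 */
#if 0
static void * __memcpy_c_sketch(void * to, const void * from, size_t n)
{
    unsigned char       *d = (unsigned char *) to;
    const unsigned char *s = (const unsigned char *) from;
    size_t               i;

    for (i = n >> 2; i > 0; i--) {           /* rep ; movsl */
        *(unsigned int *) d = *(const unsigned int *) s;
        d += 4; s += 4;
    }
    if (n & 2) {                             /* movsw */
        *(unsigned short *) d = *(const unsigned short *) s;
        d += 2; s += 2;
    }
    if (n & 1)                               /* movsb */
        *d = *s;
    return to;
}
#endif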
#ifdef HAVE_MMX

#define MMX_MMREG_SIZE 8

#define MMX1_MIN_LEN 0x800  /* 2K blocks */
#undef  MIN_LEN
#define MIN_LEN 0x40        /* 64-byte blocks */

static void * mmx_memcpy(void * to, const void * from, size_t len)
{
    void *retval;
    size_t i;
    retval = to;

    if (len >= MMX1_MIN_LEN) {
        register unsigned long int delta;
        /* Align destination to an MMREG_SIZE boundary */
        delta = ((unsigned long int)to)&(MMX_MMREG_SIZE-1);
        if (delta) {
            delta=MMX_MMREG_SIZE-delta;
            len -= delta;
            small_memcpy(to, from, delta);
        }
        i = len >> 6; /* len/64 */
        len&=63;
        for (; i>0; i--) {
            __asm__ __volatile__ (
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                "movq %%mm0, (%1)\n"
                "movq %%mm1, 8(%1)\n"
                "movq %%mm2, 16(%1)\n"
                "movq %%mm3, 24(%1)\n"
                "movq %%mm4, 32(%1)\n"
                "movq %%mm5, 40(%1)\n"
                "movq %%mm6, 48(%1)\n"
                "movq %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
            from = ((const unsigned char *)from)+64;
            to = ((unsigned char *)to)+64;
        }
        __asm__ __volatile__ ("emms":::"memory");
    }
    /*
     * Now do the tail of the block
     */
    if (len) __memcpy(to, from, len);
    return retval;
}

/* we might want to write optimized versions of these later */
#define __constant_count_memset(s,c,count) __memset_generic((s),(c),(count))

/*
 * memset(x,0,y) is a reasonably common thing to do, so we want to fill
 * things 32 bits at a time even when we don't know the size of the
 * area at compile-time..
 */
void mymemzero(void * s, unsigned long c, size_t count)
{
    int d0, d1;
    __asm__ __volatile__(
        "rep ; stosl\n\t"
        "testb $2,%b3\n\t"
        "je 1f\n\t"
        "stosw\n"
        "1:\ttestb $1,%b3\n\t"
        "je 2f\n\t"
        "stosb\n"
        "2:"
        : "=&c" (d0), "=&D" (d1)
        : "a" (c), "q" (count), "0" (count/4), "1" ((long) s)
        : "memory");
}
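
/* Usage note (illustrative, not from the original source; `buf' and
 * `size' are hypothetical): with c == 0 mymemzero behaves like memset,
 * while a nonzero c is replicated as a full 32-bit pattern by
 * "rep ; stosl", unlike memset's single byte.
 */
#if 0
    mymemzero(buf, 0, size);   /* same effect as memset(buf, 0, size) */
#endif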
#ifdef HAVE_SSE

#define SSE_MMREG_SIZE 16

static void * mmx2_memcpy(void * to, const void * from, size_t len)
{
    void *retval;
    size_t i;
    retval = to;

    /* PREFETCH has an effect even for the MOVSB instruction ;) */
    __asm__ __volatile__ (
        "   prefetchnta (%0)\n"
        "   prefetchnta 64(%0)\n"
        "   prefetchnta 128(%0)\n"
        "   prefetchnta 192(%0)\n"
        "   prefetchnta 256(%0)\n"
        : : "r" (from) );

    if (len >= MIN_LEN) {
        register unsigned long int delta;
        /* Align destination to an MMREG_SIZE boundary */
        delta = ((unsigned long int)to)&(MMX_MMREG_SIZE-1);
        if (delta) {
            delta=MMX_MMREG_SIZE-delta;
            len -= delta;
            small_memcpy(to, from, delta);
        }
        i = len >> 6; /* len/64 */
        len&=63;
        for (; i>0; i--) {
            __asm__ __volatile__ (
                "prefetchnta 320(%0)\n"
                "movq (%0), %%mm0\n"
                "movq 8(%0), %%mm1\n"
                "movq 16(%0), %%mm2\n"
                "movq 24(%0), %%mm3\n"
                "movq 32(%0), %%mm4\n"
                "movq 40(%0), %%mm5\n"
                "movq 48(%0), %%mm6\n"
                "movq 56(%0), %%mm7\n"
                "movntq %%mm0, (%1)\n"
                "movntq %%mm1, 8(%1)\n"
                "movntq %%mm2, 16(%1)\n"
                "movntq %%mm3, 24(%1)\n"
                "movntq %%mm4, 32(%1)\n"
                "movntq %%mm5, 40(%1)\n"
                "movntq %%mm6, 48(%1)\n"
                "movntq %%mm7, 56(%1)\n"
                :: "r" (from), "r" (to) : "memory");
            from = ((const unsigned char *)from)+64;
            to = ((unsigned char *)to)+64;
        }
        /* since movntq is weakly-ordered, an "sfence"
         * is needed to become ordered again. */
        __asm__ __volatile__ ("sfence":::"memory");
        __asm__ __volatile__ ("emms":::"memory");
    }
    /*
     * Now do the tail of the block
     */
    if (len) __memcpy(to, from, len);
    return retval;
}

/* SSE note: I tried to move 128 bytes at a time instead of 64, but it
   didn't make any measurable difference.  I'm using 64 for the sake of
   simplicity. [MF] */
static void * sse_memcpy(void * to, const void * from, size_t len)
{
    void *retval;
    size_t i;
    retval = to;

    /* PREFETCH has an effect even for the MOVSB instruction ;) */
    __asm__ __volatile__ (
        "   prefetchnta (%0)\n"
        "   prefetchnta 64(%0)\n"
        "   prefetchnta 128(%0)\n"
        "   prefetchnta 192(%0)\n"
        "   prefetchnta 256(%0)\n"
        : : "r" (from) );

    if (len >= MIN_LEN) {
        register unsigned long int delta;
        /* Align destination to an MMREG_SIZE boundary */
        delta = ((unsigned long int)to)&(SSE_MMREG_SIZE-1);
        if (delta) {
            delta=SSE_MMREG_SIZE-delta;
            len -= delta;
            small_memcpy(to, from, delta);
        }
        i = len >> 6; /* len/64 */
        len&=63;
        if (((unsigned long)from) & 15)
            /* if SRC is misaligned */
            for (; i>0; i--) {
                __asm__ __volatile__ (
                    "prefetchnta 320(%0)\n"
                    "movups (%0), %%xmm0\n"
                    "movups 16(%0), %%xmm1\n"
                    "movups 32(%0), %%xmm2\n"
                    "movups 48(%0), %%xmm3\n"
                    "movntps %%xmm0, (%1)\n"
                    "movntps %%xmm1, 16(%1)\n"
                    "movntps %%xmm2, 32(%1)\n"
                    "movntps %%xmm3, 48(%1)\n"
                    :: "r" (from), "r" (to) : "memory");
                from = ((const unsigned char *)from)+64;
                to = ((unsigned char *)to)+64;
            }
        else
            /*
               Only if SRC is aligned on a 16-byte boundary.  This
               allows movaps to be used instead of movups; movaps
               requires its data to be aligned, otherwise a
               general-protection exception (#GP) is generated.
            */
            for (; i>0; i--) {
                __asm__ __volatile__ (
                    "prefetchnta 320(%0)\n"
                    "movaps (%0), %%xmm0\n"
                    "movaps 16(%0), %%xmm1\n"
                    "movaps 32(%0), %%xmm2\n"
                    "movaps 48(%0), %%xmm3\n"
                    "movntps %%xmm0, (%1)\n"
                    "movntps %%xmm1, 16(%1)\n"
                    "movntps %%xmm2, 32(%1)\n"
                    "movntps %%xmm3, 48(%1)\n"
                    :: "r" (from), "r" (to) : "memory");
                from = ((const unsigned char *)from)+64;
                to = ((unsigned char *)to)+64;
            }
        /* since movntps is weakly-ordered, an "sfence"
         * is needed to become ordered again. */
        __asm__ __volatile__ ("sfence":::"memory");
        /* re-enable use of the FPU */
        __asm__ __volatile__ ("emms":::"memory");
    }
    /*
     * Now do the tail of the block
     */
    if (len) __memcpy(to, from, len);
    return retval;
}
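
/* The movups/movaps dispatch above can also be expressed with
 * intrinsics.  Hypothetical sketch, illustration only, assuming
 * <xmmintrin.h>: movaps faults with #GP on a misaligned address, so the
 * unaligned load must be selected at runtime when the source is not
 * 16-byte aligned.
 */
#if 0
    __m128 v;
    if (((unsigned long) from) & 15)
        v = _mm_loadu_ps((const float *) from);   /* movups: any alignment */
    else
        v = _mm_load_ps((const float *) from);    /* movaps: 16-byte aligned */
    _mm_stream_ps((float *) to, v);               /* movntps streaming store */
#endif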
static void * sse2_memcpy(void * to, const void * from, size_t len)
{
    void *retval;
    size_t i;
    retval = to;

    /* PREFETCH has an effect even for the MOVSB instruction ;) */
    /* Is that useful? kysucix */

    __asm__ __volatile__ (
        "   prefetchnta (%0)\n"
        "   prefetchnta 64(%0)\n"
        "   prefetchnta 128(%0)\n"
        "   prefetchnta 192(%0)\n"
        "   prefetchnta 256(%0)\n"
        /*
        "   prefetchnta 320(%0)\n"
        "   prefetchnta 384(%0)\n"
        "   prefetchnta 448(%0)\n"
        "   prefetchnta 512(%0)\n"
        */
        : : "r" (from) );

    if (len >= MIN_LEN) {
        register unsigned long int delta;
        /* Align destination to an MMREG_SIZE boundary */
        delta = ((unsigned long int)to)&(SSE_MMREG_SIZE-1);
        if (delta) {
            delta=SSE_MMREG_SIZE-delta;
            len -= delta;
            small_memcpy(to, from, delta);
        }
        i = len >> 7; /* len/128 */
        len&=127;
        if (((unsigned long)from) & 15)
            /* if SRC is misaligned */
            for (; i>0; i--) {
                __asm__ __volatile__ (
                    "prefetchnta 640(%0)\n"

                    "movdqu (%0), %%xmm0\n"
                    "movdqu 16(%0), %%xmm1\n"
                    "movdqu 32(%0), %%xmm2\n"
                    "movdqu 48(%0), %%xmm3\n"

                    "movntdq %%xmm0, (%1)\n"
                    "movntdq %%xmm1, 16(%1)\n"
                    "movntdq %%xmm2, 32(%1)\n"
                    "movntdq %%xmm3, 48(%1)\n"

                    "movdqu 64(%0), %%xmm4\n"
                    "movdqu 80(%0), %%xmm5\n"
                    "movdqu 96(%0), %%xmm6\n"
                    "movdqu 112(%0), %%xmm7\n"

                    "movntdq %%xmm4, 64(%1)\n"
                    "movntdq %%xmm5, 80(%1)\n"
                    "movntdq %%xmm6, 96(%1)\n"
                    "movntdq %%xmm7, 112(%1)\n"
                    :: "r" (from), "r" (to) : "memory");
                from = ((const unsigned char *)from)+128;
                to = ((unsigned char *)to)+128;
            }
        else
            /*
               Only if SRC is aligned on a 16-byte boundary.  This
               allows movapd to be used instead of movdqu; the aligned
               load requires its data to be aligned, otherwise a
               general-protection exception (#GP) is generated.
            */
            for (; i>0; i--) {
                __asm__ __volatile__ (
                    "prefetchnta 640(%0)\n"

                    "movapd (%0), %%xmm0\n"
                    "movapd 16(%0), %%xmm1\n"
                    "movapd 32(%0), %%xmm2\n"
                    "movapd 48(%0), %%xmm3\n"

                    "movntdq %%xmm0, (%1)\n"
                    "movntdq %%xmm1, 16(%1)\n"
                    "movntdq %%xmm2, 32(%1)\n"
                    "movntdq %%xmm3, 48(%1)\n"

                    "movapd 64(%0), %%xmm4\n"
                    "movapd 80(%0), %%xmm5\n"
                    "movapd 96(%0), %%xmm6\n"
                    "movapd 112(%0), %%xmm7\n"

                    "movntdq %%xmm4, 64(%1)\n"
                    "movntdq %%xmm5, 80(%1)\n"
                    "movntdq %%xmm6, 96(%1)\n"
                    "movntdq %%xmm7, 112(%1)\n"
                    :: "r" (from), "r" (to) : "memory");
                from = ((const unsigned char *)from)+128;
                to = ((unsigned char *)to)+128;
            }
        /* since movntdq is weakly-ordered, an "mfence"
         * is used here to become ordered again. */
        __asm__ __volatile__ ("mfence":::"memory");
        /* re-enable use of the FPU */
        __asm__ __volatile__ ("emms":::"memory");
    }
    /*
     * Now do the tail of the block
     */
    if (len) __memcpy(to, from, len);
    return retval;
}
#endif /* HAVE_SSE */
#endif /* HAVE_MMX */

static void *linux_kernel_memcpy(void *to, const void *from, size_t len) {
    return __memcpy(to,from,len);
}

#endif /* ARCH_X86 */

/* save library size on platforms without special memcpy impl. */

static struct {
    const char *name;
    void *(*function)(void *to, const void *from, size_t len);
    unsigned long long time;
    __u32 cpu_require;
} memcpy_method[] =
{
    { NULL, NULL, 0, 0},
    { "glibc memcpy()",            memcpy,              0, 0},
#ifdef ARCH_X86
    { "linux kernel memcpy()",     linux_kernel_memcpy, 0, 0},
    { "agp optimized memcpy()",    agp_memcpy,          0, 0},
#ifdef HAVE_MMX
    { "MMX optimized memcpy()",    mmx_memcpy,          0, MM_MMX},
#ifdef HAVE_SSE
    { "MMXEXT optimized memcpy()", mmx2_memcpy,         0, MM_MMXEXT},
    { "SSE optimized memcpy()",    sse_memcpy,          0, MM_MMXEXT|MM_SSE},
#ifdef HAVE_SSE2
    { "SSE2 optimized memcpy()",   sse2_memcpy,         0, MM_MMXEXT|MM_SSE|MM_SSE2},
#endif /* HAVE_SSE2 */
#endif /* HAVE_SSE */
#endif /* HAVE_MMX */
#endif /* ARCH_X86 */
    { NULL, NULL, 0, 0},
};


#ifdef ARCH_X86
static inline unsigned long long int rdtsc(void)
{
    unsigned long long int x;
    __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
    return x;
}
#else
/* fallback: measure wall-clock microseconds instead of cycles */
static inline unsigned long long int rdtsc(void)
{
    struct timeval tv;

    gettimeofday (&tv, NULL);
    return ((unsigned long long) tv.tv_sec * 1000000 + tv.tv_usec);
}
#endif

//void *(* jmemcpy)(void *to, const void *from, size_t len) = memcpy;

#define BUFSIZE 1024

void find_best_memcpy(void)
{
    unsigned long long t;
    char *buf1, *buf2;
    int i, j, best = 0;
    __u32 config_flags = detect_mm_accel();

    if (!(buf1 = (char*) malloc( BUFSIZE * 2000 * sizeof(char) )))
        return;

    if (!(buf2 = (char*) malloc( BUFSIZE * 2000 * sizeof(char) ))) {
        free( buf1 );
        return;
    }

    memset(buf1, 0, BUFSIZE*2000);
    memset(buf2, 0, BUFSIZE*2000);

    /* make sure the buffers are present in physical memory */
    memcpy( buf1, buf2, BUFSIZE * 2000 );
    memcpy( buf2, buf1, BUFSIZE * 2000 );

    func("Finding best memory copy function");
    for (i=1; memcpy_method[i].name; i++) {
        if (memcpy_method[i].cpu_require & ~config_flags)
            continue;

        t = rdtsc();

        for (j=0; j<2000; j++)
            memcpy_method[i].function( buf1 + j*BUFSIZE, buf2 + j*BUFSIZE, BUFSIZE );

        t = rdtsc() - t;
        memcpy_method[i].time = t;

        func("%s : time %2.2f",
             memcpy_method[i].name, (float) ( (float) t / 1000000.0));

        if (best == 0 || t < memcpy_method[best].time)
            best = i;
    }

    if (best) {
        notice("Using memory-to-memory copy method : %s",
               memcpy_method[best].name);

        jmemcpy = memcpy_method[best].function;
    }

    free( buf1 );
    free( buf2 );
}
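
/* Hypothetical usage sketch (not part of the original source; `dst',
 * `src' and `nbytes' are placeholders): a caller runs the benchmark
 * once at startup, then copies through the jmemcpy function pointer
 * that find_best_memcpy() selected.
 */
#if 0
    find_best_memcpy();          /* benchmarks all applicable methods */
    jmemcpy(dst, src, nbytes);   /* fastest memcpy found for this CPU */
#endif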