/*
 * Copyright (C) 2001-2014 the xine project <xine-user@lists.sourceforge.net>
 *
 * This file is part of xine, a free video player.
 *
 * xine is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * xine is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
 *
 * These are the MMX/MMX2/SSE optimized versions of memcpy
 *
 * This code was adapted from Linux Kernel sources by Nick Kurshev to
 * the mplayer program. (http://mplayer.sourceforge.net)
 *
 * Miguel Freitas split the #ifdefs into several specialized functions that
 * are benchmarked at runtime by xine. Some original comments from Nick
 * have been preserved documenting some MMX/SSE oddities.
 * Also added a kernel memcpy function that seems faster than the libc one.
 *
 * 2004-12-06
 *   Copied this source into the Kwave project and adapted it to compile
 *   cleanly within this new environment
 *   by Thomas Eschenbacher <Thomas.Eschenbacher@gmx.de>
 *   Marked most changes with "#ifdef XINE_COMPILE"
 *
 * 2009-09-12
 *   synced with latest cvs version from sourceforge.net,
 *   xine/xine-lib/src/xine-utils/memcpy.c, rev. 1.44, 2007-07-20 20:00:36
 *
 * 2014-05-26
 *   synced with latest hg version, xine-lib-1-2-02e5a69f56c9
 *
 * 2015-09-19
 *   keyword "extern" was missing for probe_fast_memcpy()
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_SYS_TIMES_H
#include <sys/times.h>
#else
#include <time.h>
#endif

#include <stdlib.h>
#include <string.h>
#include <stdint.h> /* for uint64_t */

#ifndef XINE_COMPILE

#include "cputest.h"
#include <stdio.h>

#define _(m) m
#define xprintf printf

extern void probe_fast_memcpy(void);
extern void *(* xine_fast_memcpy)(void *to, const void *from, size_t len);

#define LOG_MODULE "memcpy"
#define LOG_VERBOSE
#define HAVE_AVX

#else /* XINE_COMPILE */

/*
#define LOG
*/

#include <xine/xine_internal.h>
#include "../xine-engine/xine_private.h"

#endif /* XINE_COMPILE */

void *(* xine_fast_memcpy)(void *to, const void *from, size_t len);
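
/* Usage sketch (an illustration only, not part of the xine/Kwave API
 * contract): a caller is expected to run the probe once at startup and then
 * use the selected function pointer for all bulk copies.  The function names
 * are the ones declared above; the buffer names are made up for the example.
 *
 *   char src[4096], dst[4096];
 *   probe_fast_memcpy();                 (or xine_probe_fast_memcpy(xine)
 *                                         when built with XINE_COMPILE)
 *   xine_fast_memcpy(dst, src, sizeof(dst));
 */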

/* Original comments from mplayer (file: aclib.c)
 This part of the code was taken by me from Linux-2.4.3 and slightly modified
for the MMX, MMX2, SSE instruction sets. I have done it since linux uses page
aligned blocks but mplayer uses weakly ordered data and the original sources
could not speed them up. Only using PREFETCHNTA and MOVNTQ together has an
effect!

From the IA-32 Intel Architecture Software Developer's Manual Volume 1,

Order Number 245470:
"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

Data referenced by a program can be temporal (data will be used again) or
non-temporal (data will be referenced once and not reused in the immediate
future). To make efficient use of the processor's caches, it is generally
desirable to cache temporal data and not cache non-temporal data. Overloading
the processor's caches with non-temporal data is sometimes referred to as
"polluting the caches".
The non-temporal data is written to memory with Write-Combining semantics.

The PREFETCHh instructions permit a program to load data into the processor
at a suggested cache level, so that it is closer to the processor's load and
store unit when it is needed. If the data is already present in a level of
the cache hierarchy that is closer to the processor, the PREFETCHh instruction
will not result in any data movement.
But we should use PREFETCHNTA: a non-temporal data fetch brings data into a
location close to the processor, minimizing cache pollution.

The MOVNTQ (store quadword using non-temporal hint) instruction stores
packed integer data from an MMX register to memory, using a non-temporal hint.
The MOVNTPS (store packed single-precision floating-point values using
non-temporal hint) instruction stores packed floating-point data from an
XMM register to memory, using a non-temporal hint.

The SFENCE (Store Fence) instruction controls write ordering by creating a
fence for memory store operations. This instruction guarantees that the results
of every store instruction that precedes the store fence in program order are
globally visible before any store instruction that follows the fence. The
SFENCE instruction provides an efficient way of ensuring ordering between
procedures that produce weakly-ordered data and procedures that consume that
data.

If you have questions please contact me: Nick Kurshev: nickols_k@mail.ru.
*/

/*  mmx v.1 Note: Since we added alignment of the destination, it speeds up
    memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus the
    standard (non MMX-optimized) version.
    Note: on K6-2+ it speeds up memory copying by up to 25% and
          on K7 and P3 by about 500% (5 times).
*/

/* Additional notes on gcc assembly and processors: [MF]
prefetch is specific to AMD processors, the Intel ones should be
prefetcht0, prefetcht1, prefetcht2, which are not recognized by my gcc.
prefetchnta is supported both on athlon and pentium 3.

therefore i will take the prefetchnta instructions out of the mmx1 version
to avoid problems on pentium mmx and k6-2.

quote of the day:
"Using prefetches efficiently is more of an art than a science"
*/
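
/* For reference, the prefetchnta / non-temporal store / sfence pattern used
 * by the inline-asm loops below can also be written with compiler intrinsics
 * from <xmmintrin.h>.  This is only an illustrative sketch (it assumes
 * 16-byte aligned buffers and whole 64-byte blocks, and is not used by this
 * file):
 *
 *   #include <xmmintrin.h>
 *
 *   static void stream_copy_64(float *dst, const float *src, size_t blocks)
 *   {
 *     while (blocks--) {
 *       _mm_prefetch((const char *)src + 320, _MM_HINT_NTA);
 *       __m128 a = _mm_load_ps(src +  0);
 *       __m128 b = _mm_load_ps(src +  4);
 *       __m128 c = _mm_load_ps(src +  8);
 *       __m128 d = _mm_load_ps(src + 12);
 *       _mm_stream_ps(dst +  0, a);
 *       _mm_stream_ps(dst +  4, b);
 *       _mm_stream_ps(dst +  8, c);
 *       _mm_stream_ps(dst + 12, d);
 *       src += 16;
 *       dst += 16;
 *     }
 *     _mm_sfence();
 *   }
 */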


#if defined(ARCH_X86) || defined(ARCH_X86_64)

/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
register uintptr_t dummy;\
__asm__ __volatile__(\
  "rep; movsb"\
  :"=&D"(to), "=&S"(from), "=&c"(dummy)\
  :"0" (to), "1" (from),"2" (n)\
  : "memory");\
}

/* linux kernel __memcpy (from: /include/asm/string.h) */
static __inline__ void * linux_kernel_memcpy_impl (
                               void * to,
                               const void * from,
                               size_t n)
{
  int d0, d1, d2;

  if( n < 4 ) {
    small_memcpy(to,from,n);
  }
  else
    __asm__ __volatile__(
    "rep ; movsl\n\t"
    "testb $2,%b4\n\t"
    "je 1f\n\t"
    "movsw\n"
    "1:\ttestb $1,%b4\n\t"
    "je 2f\n\t"
    "movsb\n"
    "2:"
    : "=&c" (d0), "=&D" (d1), "=&S" (d2)
    :"0" (n/4), "q" (n),"1" ((uintptr_t) to),"2" ((uintptr_t) from)
    : "memory");

  return (to);
}
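
/* The asm above copies n/4 dwords with "rep movsl" and then uses bit 1 and
 * bit 0 of n to move the remaining 2- and 1-byte tail.  In plain C the same
 * idea reads roughly as follows (illustrative sketch only, the helper name
 * is made up):
 *
 *   static void *kernel_memcpy_c(void *to, const void *from, size_t n)
 *   {
 *     uint32_t       *d32 = to;
 *     const uint32_t *s32 = from;
 *     size_t i;
 *     for (i = 0; i < n / 4; i++)
 *       *d32++ = *s32++;
 *     {
 *       uint8_t       *d8 = (uint8_t *)d32;
 *       const uint8_t *s8 = (const uint8_t *)s32;
 *       if (n & 2) { d8[0] = s8[0]; d8[1] = s8[1]; d8 += 2; s8 += 2; }
 *       if (n & 1) { d8[0] = s8[0]; }
 *     }
 *     return to;
 *   }
 */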

#define AVX_MMREG_SIZE 32
#define SSE_MMREG_SIZE 16
#define MMX_MMREG_SIZE 8

#define MMX1_MIN_LEN 0x800  /* 2K blocks */
#define MIN_LEN 0x40  /* 64-byte blocks */
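
/* The copy loops below first align the destination to the register size.
 * Worked example (the address is made up): with to = 0x100B and
 * SSE_MMREG_SIZE = 16, delta = 0x100B & 15 = 11, so 16 - 11 = 5 head bytes
 * are copied with small_memcpy() and the main loop then starts on a 16-byte
 * boundary.  The same computation is used with AVX_MMREG_SIZE (32) and
 * MMX_MMREG_SIZE (8).
 */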

/* SSE note: i tried to move 128 bytes at a time instead of 64 but it
didn't make any measurable difference. i'm using 64 for the sake of
simplicity. [MF] */
static void * sse_memcpy(void * to, const void * from, size_t len)
{
  void *retval;
  size_t i;
  retval = to;

  /* PREFETCH has effect even for MOVSB instruction ;) */
  __asm__ __volatile__ (
    "   prefetchnta (%0)\n"
    "   prefetchnta 32(%0)\n"
    "   prefetchnta 64(%0)\n"
    "   prefetchnta 96(%0)\n"
    "   prefetchnta 128(%0)\n"
    "   prefetchnta 160(%0)\n"
    "   prefetchnta 192(%0)\n"
    "   prefetchnta 224(%0)\n"
    "   prefetchnta 256(%0)\n"
    "   prefetchnta 288(%0)\n"
    : : "r" (from) );

  if(len >= MIN_LEN)
  {
    register uintptr_t delta;
    /* Align destination to SSE_MMREG_SIZE boundary */
    delta = ((uintptr_t)to)&(SSE_MMREG_SIZE-1);
    if(delta)
    {
      delta=SSE_MMREG_SIZE-delta;
      len -= delta;
      small_memcpy(to, from, delta);
    }
    i = len >> 6; /* len/64 */
    len&=63;
    if(((uintptr_t)from) & 15)
      /* if SRC is misaligned */
      for(; i>0; i--)
      {
        __asm__ __volatile__ (
        "prefetchnta 320(%0)\n"
        "prefetchnta 352(%0)\n"
        "movups (%0), %%xmm0\n"
        "movups 16(%0), %%xmm1\n"
        "movups 32(%0), %%xmm2\n"
        "movups 48(%0), %%xmm3\n"
        "movntps %%xmm0, (%1)\n"
        "movntps %%xmm1, 16(%1)\n"
        "movntps %%xmm2, 32(%1)\n"
        "movntps %%xmm3, 48(%1)\n"
        :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 64;
        to = ((unsigned char *)to) + 64;
      }
    else
      /*
         Only if SRC is aligned on a 16-byte boundary.
         This allows the use of movaps instead of movups, which requires the
         data to be aligned, otherwise a general-protection exception (#GP)
         is generated.
      */
      for(; i>0; i--)
      {
        __asm__ __volatile__ (
        "prefetchnta 320(%0)\n"
        "prefetchnta 352(%0)\n"
        "movaps (%0), %%xmm0\n"
        "movaps 16(%0), %%xmm1\n"
        "movaps 32(%0), %%xmm2\n"
        "movaps 48(%0), %%xmm3\n"
        "movntps %%xmm0, (%1)\n"
        "movntps %%xmm1, 16(%1)\n"
        "movntps %%xmm2, 32(%1)\n"
        "movntps %%xmm3, 48(%1)\n"
        :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 64;
        to = ((unsigned char *)to) + 64;
      }
    /* since movntps is weakly-ordered, an "sfence"
     * is needed to become ordered again. */
    __asm__ __volatile__ ("sfence":::"memory");
  }
  /*
   *	Now do the tail of the block
   */
  if(len) linux_kernel_memcpy_impl(to, from, len);
  return retval;
}

#ifdef HAVE_AVX
static void * avx_memcpy(void * to, const void * from, size_t len)
{
  void *retval;
  size_t i;
  retval = to;

  /* PREFETCH has effect even for MOVSB instruction ;) */
  __asm__ __volatile__ (
    "   prefetchnta (%0)\n"
    "   prefetchnta 32(%0)\n"
    "   prefetchnta 64(%0)\n"
    "   prefetchnta 96(%0)\n"
    "   prefetchnta 128(%0)\n"
    "   prefetchnta 160(%0)\n"
    "   prefetchnta 192(%0)\n"
    "   prefetchnta 224(%0)\n"
    "   prefetchnta 256(%0)\n"
    "   prefetchnta 288(%0)\n"
    : : "r" (from) );

  if(len >= MIN_LEN)
  {
    register uintptr_t delta;
    /* Align destination to AVX_MMREG_SIZE boundary */
    delta = ((uintptr_t)to)&(AVX_MMREG_SIZE-1);
    if(delta)
    {
      delta=AVX_MMREG_SIZE-delta;
      len -= delta;
      small_memcpy(to, from, delta);
    }
    i = len >> 7; /* len/128 */
    len&=127;
    if(((uintptr_t)from) & 31)
      /* if SRC is misaligned */
      for(; i>0; i--)
      {
        __asm__ __volatile__ (
        "prefetchnta 320(%0)\n"
        "prefetchnta 352(%0)\n"
        "prefetchnta 384(%0)\n"
        "prefetchnta 416(%0)\n"
        "vmovups    (%0), %%ymm0\n"
        "vmovups  32(%0), %%ymm1\n"
        "vmovups  64(%0), %%ymm2\n"
        "vmovups  96(%0), %%ymm3\n"
        "vmovntps %%ymm0,   (%1)\n"
        "vmovntps %%ymm1, 32(%1)\n"
        "vmovntps %%ymm2, 64(%1)\n"
        "vmovntps %%ymm3, 96(%1)\n"
        :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 128;
        to = ((unsigned char *)to) + 128;
      }
    else
      /*
         Only if SRC is aligned on a 32-byte boundary.
         This allows the use of vmovaps instead of vmovups, which requires
         the data to be aligned, otherwise a general-protection exception
         (#GP) is generated.
      */
      for(; i>0; i--)
      {
        __asm__ __volatile__ (
        "prefetchnta 320(%0)\n"
        "prefetchnta 352(%0)\n"
        "prefetchnta 384(%0)\n"
        "prefetchnta 416(%0)\n"
        "vmovaps    (%0), %%ymm0\n"
        "vmovaps  32(%0), %%ymm1\n"
        "vmovaps  64(%0), %%ymm2\n"
        "vmovaps  96(%0), %%ymm3\n"
        "vmovntps %%ymm0,   (%1)\n"
        "vmovntps %%ymm1, 32(%1)\n"
        "vmovntps %%ymm2, 64(%1)\n"
        "vmovntps %%ymm3, 96(%1)\n"
        :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 128;
        to = ((unsigned char *)to) + 128;
      }
    /* since vmovntps is weakly-ordered, an "sfence"
     * is needed to become ordered again. */
    __asm__ __volatile__ ("sfence":::"memory");
    __asm__ __volatile__ ("vzeroupper");
  }
  /*
   *	Now do the tail of the block
   */
  if(len) linux_kernel_memcpy_impl(to, from, len);
  return retval;
}
#endif /* HAVE_AVX */

static void * mmx_memcpy(void * to, const void * from, size_t len)
{
  void *retval;
  size_t i;
  retval = to;

  if(len >= MMX1_MIN_LEN)
  {
    register uintptr_t delta;
    /* Align destination to MMX_MMREG_SIZE boundary */
    delta = ((uintptr_t)to)&(MMX_MMREG_SIZE-1);
    if(delta)
    {
      delta=MMX_MMREG_SIZE-delta;
      len -= delta;
      small_memcpy(to, from, delta);
    }
    i = len >> 6; /* len/64 */
    len&=63;
    for(; i>0; i--)
    {
      __asm__ __volatile__ (
      "movq (%0), %%mm0\n"
      "movq 8(%0), %%mm1\n"
      "movq 16(%0), %%mm2\n"
      "movq 24(%0), %%mm3\n"
      "movq 32(%0), %%mm4\n"
      "movq 40(%0), %%mm5\n"
      "movq 48(%0), %%mm6\n"
      "movq 56(%0), %%mm7\n"
      "movq %%mm0, (%1)\n"
      "movq %%mm1, 8(%1)\n"
      "movq %%mm2, 16(%1)\n"
      "movq %%mm3, 24(%1)\n"
      "movq %%mm4, 32(%1)\n"
      "movq %%mm5, 40(%1)\n"
      "movq %%mm6, 48(%1)\n"
      "movq %%mm7, 56(%1)\n"
      :: "r" (from), "r" (to) : "memory");
      from = ((const unsigned char *)from) + 64;
      to = ((unsigned char *)to) + 64;
    }
    __asm__ __volatile__ ("emms":::"memory");
  }
  /*
   *	Now do the tail of the block
   */
  if(len) linux_kernel_memcpy_impl(to, from, len);
  return retval;
}

static void * mmx2_memcpy(void * to, const void * from, size_t len)
{
  void *retval;
  size_t i;
  retval = to;

  /* PREFETCH has effect even for MOVSB instruction ;) */
  __asm__ __volatile__ (
    "   prefetchnta (%0)\n"
    "   prefetchnta 32(%0)\n"
    "   prefetchnta 64(%0)\n"
    "   prefetchnta 96(%0)\n"
    "   prefetchnta 128(%0)\n"
    "   prefetchnta 160(%0)\n"
    "   prefetchnta 192(%0)\n"
    "   prefetchnta 224(%0)\n"
    "   prefetchnta 256(%0)\n"
    "   prefetchnta 288(%0)\n"
    : : "r" (from) );

  if(len >= MIN_LEN)
  {
    register uintptr_t delta;
    /* Align destination to MMX_MMREG_SIZE boundary */
    delta = ((uintptr_t)to)&(MMX_MMREG_SIZE-1);
    if(delta)
    {
      delta=MMX_MMREG_SIZE-delta;
      len -= delta;
      small_memcpy(to, from, delta);
    }
    i = len >> 6; /* len/64 */
    len&=63;
    for(; i>0; i--)
    {
      __asm__ __volatile__ (
      "prefetchnta 320(%0)\n"
      "prefetchnta 352(%0)\n"
      "movq (%0), %%mm0\n"
      "movq 8(%0), %%mm1\n"
      "movq 16(%0), %%mm2\n"
      "movq 24(%0), %%mm3\n"
      "movq 32(%0), %%mm4\n"
      "movq 40(%0), %%mm5\n"
      "movq 48(%0), %%mm6\n"
      "movq 56(%0), %%mm7\n"
      "movntq %%mm0, (%1)\n"
      "movntq %%mm1, 8(%1)\n"
      "movntq %%mm2, 16(%1)\n"
      "movntq %%mm3, 24(%1)\n"
      "movntq %%mm4, 32(%1)\n"
      "movntq %%mm5, 40(%1)\n"
      "movntq %%mm6, 48(%1)\n"
      "movntq %%mm7, 56(%1)\n"
      :: "r" (from), "r" (to) : "memory");
      from = ((const unsigned char *)from) + 64;
      to = ((unsigned char *)to) + 64;
    }
    /* since movntq is weakly-ordered, an "sfence"
     * is needed to become ordered again. */
    __asm__ __volatile__ ("sfence":::"memory");
    __asm__ __volatile__ ("emms":::"memory");
  }
  /*
   *	Now do the tail of the block
   */
  if(len) linux_kernel_memcpy_impl(to, from, len);
  return retval;
}

static void *linux_kernel_memcpy(void *to, const void *from, size_t len) {
  return linux_kernel_memcpy_impl(to,from,len);
}
#endif /* ARCH_X86 */

static const struct {
  const char name[16];
  void *(*const  function)(void *to, const void *from, size_t len);

  uint32_t cpu_require;
} memcpy_method[] =
{
  { "", NULL, 0 },
  { "libc", memcpy, 0 },
#if (defined(ARCH_X86) || defined(ARCH_X86_64))
  { "linux kernel", linux_kernel_memcpy, 0 },
  { "MMX ", mmx_memcpy, MM_MMX },
  { "MMXEXT", mmx2_memcpy, MM_MMXEXT },
  { "SSE", sse_memcpy, MM_MMXEXT|MM_SSE },
# ifdef HAVE_AVX
  { "AVX", avx_memcpy, MM_ACCEL_X86_AVX },
# endif /* HAVE_AVX */
#endif /* ARCH_X86 */
  { "", NULL, 0 }
};

static uint64_t memcpy_timing[sizeof(memcpy_method)/sizeof(memcpy_method[0])] = { 0, };

#ifdef HAVE_POSIX_TIMERS
/* Prefer clock_gettime() where available. */

# ifndef CLOCK_THREAD_CPUTIME_ID
/*  not defined in NetBSD (bug #535) */
#   define CLOCK_THREAD_CPUTIME_ID CLOCK_MONOTONIC
# endif

static int64_t _x_gettime(void)
{
  struct timespec tm;
  return (clock_gettime (CLOCK_THREAD_CPUTIME_ID, &tm) == -1)
       ? times (NULL)
       : (int64_t)tm.tv_sec * INT64_C(1000000000) + tm.tv_nsec;
}
#  define rdtsc(x) _x_gettime()

#elif (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(HAVE_SYS_TIMES_H)
static int64_t rdtsc(int config_flags)
{
  int64_t x;

  /* that should prevent us from trying cpuid with old cpus */
  if( config_flags & MM_MMX ) {
    uint32_t lo, hi;
    /* rdtsc (0x0f 0x31) returns the time stamp counter in edx:eax;
     * read both halves explicitly so this also works on x86_64,
     * where the "=A" constraint would lose the upper 32 bits. */
    __asm__ volatile (".byte 0x0f, 0x31" : "=a" (lo), "=d" (hi));
    x = ((int64_t)hi << 32) | lo;
    return x;
  } else {
    return times(NULL);
  }
}
#else

static uint64_t rdtsc(int config_flags)
{
  (void)config_flags;
  /* FIXME: implement an equivalent for using optimized memcpy on other
            architectures */
#ifdef HAVE_SYS_TIMES_H
  struct tms tp;
  return times(&tp);
#else
  return clock();
#endif /* HAVE_SYS_TIMES_H */
}
#endif

#ifdef XINE_COMPILE
static void update_fast_memcpy(void *user_data, xine_cfg_entry_t *entry) {
  static int   config_flags = -1;
  xine_t      *xine = (xine_t *) user_data;
  int          method;

  config_flags = xine_mm_accel();

  method = entry->num_value;

  if (method != 0
      && (config_flags & memcpy_method[method].cpu_require) ==
      memcpy_method[method].cpu_require ) {
    lprintf("using %s memcpy()\n", memcpy_method[method].name );
    xine_fast_memcpy = memcpy_method[method].function;
    return;
  } else {
    xprintf(xine, XINE_VERBOSITY_DEBUG, "xine: will probe memcpy on startup\n" );
  }
}
#endif /* XINE_COMPILE */

#define BUFSIZE 1024*1024
#ifdef XINE_COMPILE
void xine_probe_fast_memcpy(xine_t *xine)
#else
void probe_fast_memcpy(void)
#endif /* XINE_COMPILE */
{
  uint64_t          t;
  char             *buf1, *buf2;
  int               i, j, best;
  int               config_flags = -1;

#ifdef XINE_COMPILE

  static const char *const memcpy_methods[] = {
    "probe", "libc",
#if (defined(ARCH_X86) || defined(ARCH_X86_64))
    "kernel", "mmx", "mmxext", "sse",
# ifdef HAVE_AVX
    "avx",
# endif /* HAVE_AVX */
#endif
    NULL
  };

  config_flags = xine_mm_accel();

  best = xine->config->register_enum (xine->config, "engine.performance.memcpy_method", 0,
				      (char **)memcpy_methods,
				      _("memcopy method used by xine"),
				      _("The copying of large memory blocks is one of the most "
					"expensive operations on todays computers. Therefore xine "
					"provides various tuned methods to do this copying. "
					"Usually, the best method is detected automatically."),
				      20, update_fast_memcpy, (void *) xine);

  /* check if the configured method is valid for this machine */
  if( best != 0 &&
      (size_t)best < sizeof(memcpy_method)/sizeof(memcpy_method[0]) &&
     (config_flags & memcpy_method[best].cpu_require) ==
      memcpy_method[best].cpu_require ) {
    lprintf("using %s memcpy()\n", memcpy_method[best].name );
    xine_fast_memcpy = memcpy_method[best].function;
    return;
  }
#else /* XINE_COMPILE */
  config_flags = xine_mm_accel();
#endif /* XINE_COMPILE */
  best = 0;

  xine_fast_memcpy = memcpy;

  if( (buf1 = malloc(BUFSIZE)) == NULL )
    return;

  if( (buf2 = malloc(BUFSIZE)) == NULL ) {
    free(buf1);
    return;
  }

  xprintf(_("Benchmarking memcpy methods (smaller is better):\n"));
  /* make sure the buffers are present in physical memory */
  memset(buf1,0,BUFSIZE);
  memset(buf2,0,BUFSIZE);

  /* some initial activity to ensure that we're not running slowly :-) */
  for(j=0;j<50;j++) {
    memcpy_method[1].function(buf2,buf1,BUFSIZE);
    memcpy_method[1].function(buf1,buf2,BUFSIZE);
  }

  for(i=1; memcpy_method[i].name[0]; i++)
  {
    if( (config_flags & memcpy_method[i].cpu_require) !=
         memcpy_method[i].cpu_require )
      continue;

    t = rdtsc(config_flags);
    for(j=0;j<50;j++) {
      memcpy_method[i].function(buf2,buf1,BUFSIZE);
      memcpy_method[i].function(buf1,buf2,BUFSIZE);
    }

    t = rdtsc(config_flags) - t;
    memcpy_timing[i] = t;

    xprintf("\t%s : %lld\n", memcpy_method[i].name, (long long int)t);

    if( best == 0 || t < memcpy_timing[best] )
      best = i;
  }

#ifdef XINE_COMPILE
  xine->config->update_num (xine->config, "engine.performance.memcpy_method", best);
#else /* XINE_COMPILE */
  xprintf("using -> '%s'\n", memcpy_method[best].name);
  xine_fast_memcpy = memcpy_method[best].function;
#endif /* XINE_COMPILE */

  free(buf1);
  free(buf2);
}