/*
 * Copyright (C) 2001-2014 the xine project <xine-user@lists.sourceforge.net>
 *
 * This file is part of xine, a free video player.
 *
 * xine is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * xine is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110, USA
 *
 * These are the MMX/MMX2/SSE optimized versions of memcpy
 *
 * This code was adapted from Linux Kernel sources by Nick Kurshev to
 * the mplayer program. (http://mplayer.sourceforge.net)
 *
 * Miguel Freitas split the #ifdefs into several specialized functions that
 * are benchmarked at runtime by xine. Some original comments from Nick
 * have been preserved documenting some MMX/SSE oddities.
 * Also added kernel memcpy function that seems faster than libc one.
 *
 * 2004-12-06
 * Copied this source into the Kwave project and adapted it to compile
 * cleanly within this new environment
 * by Thomas Eschenbacher <Thomas.Eschenbacher@gmx.de>
 * Marked most changes with "#ifdef XINE_COMPILE"
 *
 * 2009-09-12
 * synced with latest cvs version from sourceforge.net,
 * xine/xine-lib/src/xine-utils/memcpy.c, rev. 1.44, 2007-07-20 20:00:36
 *
 * 2014-05-26
 * synced with latest hg version, xine-lib-1-2-02e5a69f56c9
 *
 * 2015-09-19
 * keyword "extern" was missing for probe_fast_memcpy()
 */

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef HAVE_SYS_TIMES_H
#include <sys/times.h>
#else
#include <time.h>
#endif

#include <stdlib.h>
#include <string.h>
#include <stdint.h> /* for uint64_t */

#ifndef XINE_COMPILE

#include "cputest.h"
#include <stdio.h>

#define _(m) m
#define xprintf printf

extern void probe_fast_memcpy(void);
extern void *(* xine_fast_memcpy)(void *to, const void *from, size_t len);

#define LOG_MODULE "memcpy"
#define LOG_VERBOSE
#define HAVE_AVX

#else /* XINE_COMPILE */

/*
#define LOG
*/

#include <xine/xine_internal.h>
#include "../xine-engine/xine_private.h"

#endif /* XINE_COMPILE */

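/*
 * Runtime-selected copy routine. A minimal usage sketch for the standalone
 * build (XINE_COMPILE undefined); dst, src and length are placeholder
 * variables for illustration, everything else is declared above:
 *
 *   probe_fast_memcpy();                  // benchmark once at startup
 *   xine_fast_memcpy(dst, src, length);   // then use like memcpy()
 *
 * The pointer must not be called before the probe has run, because it
 * starts out as NULL (zero-initialized file-scope variable).
 */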
void *(* xine_fast_memcpy)(void *to, const void *from, size_t len);

/* Original comments from mplayer (file: aclib.c)
This part of the code was taken by me from Linux-2.4.3 and slightly modified
for the MMX, MMX2, SSE instruction sets. I have done it since linux uses page
aligned blocks but mplayer uses weakly ordered data and the original sources
cannot speed them up. Only using PREFETCHNTA and MOVNTQ together has an effect!

From IA-32 Intel Architecture Software Developer's Manual Volume 1,

Order Number 245470:
"10.4.6. Cacheability Control, Prefetch, and Memory Ordering Instructions"

Data referenced by a program can be temporal (data will be used again) or
non-temporal (data will be referenced once and not reused in the immediate
future). To make efficient use of the processor's caches, it is generally
desirable to cache temporal data and not cache non-temporal data. Overloading
the processor's caches with non-temporal data is sometimes referred to as
"polluting the caches".
The non-temporal data is written to memory with Write-Combining semantics.

The PREFETCHh instructions permit a program to load data into the processor
at a suggested cache level, so that it is closer to the processor's load and
store unit when it is needed. If the data is already present in a level of
the cache hierarchy that is closer to the processor, the PREFETCHh instruction
will not result in any data movement.
But we should use PREFETCHNTA: it fetches non-temporal data into a location
close to the processor, minimizing cache pollution.

The MOVNTQ (store quadword using non-temporal hint) instruction stores
packed integer data from an MMX register to memory, using a non-temporal hint.
The MOVNTPS (store packed single-precision floating-point values using
non-temporal hint) instruction stores packed floating-point data from an
XMM register to memory, using a non-temporal hint.

The SFENCE (Store Fence) instruction controls write ordering by creating a
fence for memory store operations. This instruction guarantees that the results
of every store instruction that precedes the store fence in program order are
globally visible before any store instruction that follows the fence. The
SFENCE instruction provides an efficient way of ensuring ordering between
procedures that produce weakly-ordered data and procedures that consume that
data.

If you have questions please contact me: Nick Kurshev: nickols_k@mail.ru.
*/
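
/*
 * Summary of how the quoted material is applied below (restating the code,
 * no new behaviour assumed): sse_memcpy(), avx_memcpy() and mmx2_memcpy()
 * prefetch the source with prefetchnta, stream the destination with
 * non-temporal stores (movntps / vmovntps / movntq), and finish with sfence
 * so that the weakly-ordered stores become globally visible in order.
 */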

/* mmx v.1 note: since we added alignment of the destination, it speeds up
   memory copying on PentMMX, Celeron-1 and P2 by up to 12% versus the
   standard (non MMX-optimized) version.
   Note: on K6-2+ it speeds up memory copying by up to 25% and
   on K7 and P3 by about 500% (5 times).
*/

/* Additional notes on gcc assembly and processors: [MF]
   prefetch is specific to AMD processors, the Intel ones should be
   prefetcht0, prefetcht1, prefetcht2, which are not recognized by my gcc.
   prefetchnta is supported both on athlon and pentium 3.

   therefore i will take off prefetchnta instructions from the mmx1 version
   to avoid problems on pentium mmx and k6-2.

   quote of the day:
   "Using prefetches efficiently is more of an art than a science"
*/


#if defined(ARCH_X86) || defined(ARCH_X86_64)

/* for small memory blocks (<256 bytes) this version is faster */
#define small_memcpy(to,from,n)\
{\
  register uintptr_t dummy;\
  __asm__ __volatile__(\
    "rep; movsb"\
    :"=&D"(to), "=&S"(from), "=&c"(dummy)\
    :"0" (to), "1" (from),"2" (n)\
    : "memory");\
}
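
/*
 * Note on the constraints above (descriptive only, no change in behaviour):
 * "rep movsb" copies ECX/RCX bytes from [ESI/RSI] to [EDI/RDI]. The "0",
 * "1", "2" input constraints load `to`, `from` and `n` into the same
 * registers as outputs 0-2 (EDI, ESI, ECX), and listing those registers as
 * early-clobber outputs plus a "memory" clobber tells gcc that all three
 * registers and the destination buffer are modified by the statement.
 * Via the output constraints the macro also advances the caller's `to` and
 * `from` pointers by `n`; the alignment code below relies on that.
 */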

/* linux kernel __memcpy (from: /include/asm/string.h) */
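/*
 * How the inline assembly below works (a description of the existing code,
 * nothing added): for n >= 4 it copies n/4 dwords with "rep movsl", then
 * tests bit 1 of n ("testb $2") and copies one word with "movsw" if set,
 * and finally tests bit 0 ("testb $1") and copies the last byte with
 * "movsb". Blocks shorter than 4 bytes go through small_memcpy() instead.
 */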
static __inline__ void * linux_kernel_memcpy_impl (
  void * to,
  const void * from,
  size_t n)
{
  int d0, d1, d2;

  if( n < 4 ) {
    small_memcpy(to,from,n);
  }
  else
    __asm__ __volatile__(
      "rep ; movsl\n\t"
      "testb $2,%b4\n\t"
      "je 1f\n\t"
      "movsw\n"
      "1:\ttestb $1,%b4\n\t"
      "je 2f\n\t"
      "movsb\n"
      "2:"
      : "=&c" (d0), "=&D" (d1), "=&S" (d2)
      :"0" (n/4), "q" (n),"1" ((uintptr_t) to),"2" ((uintptr_t) from)
      : "memory");

  return (to);
}

#define AVX_MMREG_SIZE 32
#define SSE_MMREG_SIZE 16
#define MMX_MMREG_SIZE 8

#define MMX1_MIN_LEN 0x800 /* 2K blocks */
#define MIN_LEN 0x40 /* 64-byte blocks */
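
/*
 * Threshold note (restating what the functions below do): mmx_memcpy() only
 * enters its 64-byte unrolled loop for blocks of at least MMX1_MIN_LEN
 * (2 KiB), while the prefetching variants (mmx2/sse/avx) take their
 * non-temporal path from MIN_LEN (64 bytes) upward. Anything below the
 * threshold, and every tail, is handled by linux_kernel_memcpy_impl().
 */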

/* SSE note: i tried to move 128 bytes a time instead of 64 but it
   didn't make any measurable difference. i'm using 64 for the sake of
   simplicity. [MF] */
static void * sse_memcpy(void * to, const void * from, size_t len)
{
  void *retval;
  size_t i;
  retval = to;

  /* PREFETCH has effect even for MOVSB instruction ;) */
  __asm__ __volatile__ (
    " prefetchnta (%0)\n"
    " prefetchnta 32(%0)\n"
    " prefetchnta 64(%0)\n"
    " prefetchnta 96(%0)\n"
    " prefetchnta 128(%0)\n"
    " prefetchnta 160(%0)\n"
    " prefetchnta 192(%0)\n"
    " prefetchnta 224(%0)\n"
    " prefetchnta 256(%0)\n"
    " prefetchnta 288(%0)\n"
    : : "r" (from) );

  if(len >= MIN_LEN)
  {
    register uintptr_t delta;
    /* Align destination to SSE_MMREG_SIZE boundary */
    delta = ((uintptr_t)to)&(SSE_MMREG_SIZE-1);
    if(delta)
    {
      delta=SSE_MMREG_SIZE-delta;
      len -= delta;
      small_memcpy(to, from, delta);
    }
    i = len >> 6; /* len/64 */
    len&=63;
    if(((uintptr_t)from) & 15)
      /* if SRC is misaligned */
      for(; i>0; i--)
      {
        __asm__ __volatile__ (
          "prefetchnta 320(%0)\n"
          "prefetchnta 352(%0)\n"
          "movups (%0), %%xmm0\n"
          "movups 16(%0), %%xmm1\n"
          "movups 32(%0), %%xmm2\n"
          "movups 48(%0), %%xmm3\n"
          "movntps %%xmm0, (%1)\n"
          "movntps %%xmm1, 16(%1)\n"
          "movntps %%xmm2, 32(%1)\n"
          "movntps %%xmm3, 48(%1)\n"
          :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 64;
        to = ((unsigned char *)to) + 64;
      }
    else
      /*
        Only if SRC is aligned on a 16-byte boundary.
        This allows the use of movaps instead of movups; movaps requires the
        data to be aligned, otherwise a general-protection exception (#GP)
        is generated.
      */
      for(; i>0; i--)
      {
        __asm__ __volatile__ (
          "prefetchnta 320(%0)\n"
          "prefetchnta 352(%0)\n"
          "movaps (%0), %%xmm0\n"
          "movaps 16(%0), %%xmm1\n"
          "movaps 32(%0), %%xmm2\n"
          "movaps 48(%0), %%xmm3\n"
          "movntps %%xmm0, (%1)\n"
          "movntps %%xmm1, 16(%1)\n"
          "movntps %%xmm2, 32(%1)\n"
          "movntps %%xmm3, 48(%1)\n"
          :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 64;
        to = ((unsigned char *)to) + 64;
      }
    /* since the movntps stores are weakly-ordered, an "sfence"
     * is needed to become ordered again. */
    __asm__ __volatile__ ("sfence":::"memory");
  }
  /*
   * Now do the tail of the block
   */
  if(len) linux_kernel_memcpy_impl(to, from, len);
  return retval;
}

#ifdef HAVE_AVX
static void * avx_memcpy(void * to, const void * from, size_t len)
{
  void *retval;
  size_t i;
  retval = to;

  /* PREFETCH has effect even for MOVSB instruction ;) */
  __asm__ __volatile__ (
    " prefetchnta (%0)\n"
    " prefetchnta 32(%0)\n"
    " prefetchnta 64(%0)\n"
    " prefetchnta 96(%0)\n"
    " prefetchnta 128(%0)\n"
    " prefetchnta 160(%0)\n"
    " prefetchnta 192(%0)\n"
    " prefetchnta 224(%0)\n"
    " prefetchnta 256(%0)\n"
    " prefetchnta 288(%0)\n"
    : : "r" (from) );

  if(len >= MIN_LEN)
  {
    register uintptr_t delta;
    /* Align destination to AVX_MMREG_SIZE boundary */
    delta = ((uintptr_t)to)&(AVX_MMREG_SIZE-1);
    if(delta)
    {
      delta=AVX_MMREG_SIZE-delta;
      len -= delta;
      small_memcpy(to, from, delta);
    }
    i = len >> 7; /* len/128 */
    len&=127;
    if(((uintptr_t)from) & 31)
      /* if SRC is misaligned */
      for(; i>0; i--)
      {
        __asm__ __volatile__ (
          "prefetchnta 320(%0)\n"
          "prefetchnta 352(%0)\n"
          "prefetchnta 384(%0)\n"
          "prefetchnta 416(%0)\n"
          "vmovups (%0), %%ymm0\n"
          "vmovups 32(%0), %%ymm1\n"
          "vmovups 64(%0), %%ymm2\n"
          "vmovups 96(%0), %%ymm3\n"
          "vmovntps %%ymm0, (%1)\n"
          "vmovntps %%ymm1, 32(%1)\n"
          "vmovntps %%ymm2, 64(%1)\n"
          "vmovntps %%ymm3, 96(%1)\n"
          :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 128;
        to = ((unsigned char *)to) + 128;
      }
    else
      /*
        Only if SRC is aligned on a 32-byte boundary.
        This allows the use of vmovaps instead of vmovups; vmovaps requires
        the data to be aligned, otherwise a general-protection exception
        (#GP) is generated.
      */
      for(; i>0; i--)
      {
        __asm__ __volatile__ (
          "prefetchnta 320(%0)\n"
          "prefetchnta 352(%0)\n"
          "prefetchnta 384(%0)\n"
          "prefetchnta 416(%0)\n"
          "vmovaps (%0), %%ymm0\n"
          "vmovaps 32(%0), %%ymm1\n"
          "vmovaps 64(%0), %%ymm2\n"
          "vmovaps 96(%0), %%ymm3\n"
          "vmovntps %%ymm0, (%1)\n"
          "vmovntps %%ymm1, 32(%1)\n"
          "vmovntps %%ymm2, 64(%1)\n"
          "vmovntps %%ymm3, 96(%1)\n"
          :: "r" (from), "r" (to) : "memory");
        from = ((const unsigned char *)from) + 128;
        to = ((unsigned char *)to) + 128;
      }
    /* since the vmovntps stores are weakly-ordered, an "sfence"
     * is needed to become ordered again. */
    __asm__ __volatile__ ("sfence":::"memory");
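    /* vzeroupper clears the upper halves of the YMM registers so that
     * subsequent legacy-SSE code does not pay AVX/SSE transition penalties
     * (descriptive comment only, the instruction was already here). */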
    __asm__ __volatile__ ("vzeroupper");
  }
  /*
   * Now do the tail of the block
   */
  if(len) linux_kernel_memcpy_impl(to, from, len);
  return retval;
}
#endif /* HAVE_AVX */

static void * mmx_memcpy(void * to, const void * from, size_t len)
{
  void *retval;
  size_t i;
  retval = to;

  if(len >= MMX1_MIN_LEN)
  {
    register uintptr_t delta;
    /* Align destination to MMX_MMREG_SIZE boundary */
    delta = ((uintptr_t)to)&(MMX_MMREG_SIZE-1);
    if(delta)
    {
      delta=MMX_MMREG_SIZE-delta;
      len -= delta;
      small_memcpy(to, from, delta);
    }
    i = len >> 6; /* len/64 */
    len&=63;
    for(; i>0; i--)
    {
      __asm__ __volatile__ (
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "movq 16(%0), %%mm2\n"
        "movq 24(%0), %%mm3\n"
        "movq 32(%0), %%mm4\n"
        "movq 40(%0), %%mm5\n"
        "movq 48(%0), %%mm6\n"
        "movq 56(%0), %%mm7\n"
        "movq %%mm0, (%1)\n"
        "movq %%mm1, 8(%1)\n"
        "movq %%mm2, 16(%1)\n"
        "movq %%mm3, 24(%1)\n"
        "movq %%mm4, 32(%1)\n"
        "movq %%mm5, 40(%1)\n"
        "movq %%mm6, 48(%1)\n"
        "movq %%mm7, 56(%1)\n"
        :: "r" (from), "r" (to) : "memory");
      from = ((const unsigned char *)from) + 64;
      to = ((unsigned char *)to) + 64;
    }
    __asm__ __volatile__ ("emms":::"memory");
  }
  /*
   * Now do the tail of the block
   */
  if(len) linux_kernel_memcpy_impl(to, from, len);
  return retval;
}

static void * mmx2_memcpy(void * to, const void * from, size_t len)
{
  void *retval;
  size_t i;
  retval = to;

  /* PREFETCH has effect even for MOVSB instruction ;) */
  __asm__ __volatile__ (
    " prefetchnta (%0)\n"
    " prefetchnta 32(%0)\n"
    " prefetchnta 64(%0)\n"
    " prefetchnta 96(%0)\n"
    " prefetchnta 128(%0)\n"
    " prefetchnta 160(%0)\n"
    " prefetchnta 192(%0)\n"
    " prefetchnta 224(%0)\n"
    " prefetchnta 256(%0)\n"
    " prefetchnta 288(%0)\n"
    : : "r" (from) );

  if(len >= MIN_LEN)
  {
    register uintptr_t delta;
    /* Align destination to MMX_MMREG_SIZE boundary */
    delta = ((uintptr_t)to)&(MMX_MMREG_SIZE-1);
    if(delta)
    {
      delta=MMX_MMREG_SIZE-delta;
      len -= delta;
      small_memcpy(to, from, delta);
    }
    i = len >> 6; /* len/64 */
    len&=63;
    for(; i>0; i--)
    {
      __asm__ __volatile__ (
        "prefetchnta 320(%0)\n"
        "prefetchnta 352(%0)\n"
        "movq (%0), %%mm0\n"
        "movq 8(%0), %%mm1\n"
        "movq 16(%0), %%mm2\n"
        "movq 24(%0), %%mm3\n"
        "movq 32(%0), %%mm4\n"
        "movq 40(%0), %%mm5\n"
        "movq 48(%0), %%mm6\n"
        "movq 56(%0), %%mm7\n"
        "movntq %%mm0, (%1)\n"
        "movntq %%mm1, 8(%1)\n"
        "movntq %%mm2, 16(%1)\n"
        "movntq %%mm3, 24(%1)\n"
        "movntq %%mm4, 32(%1)\n"
        "movntq %%mm5, 40(%1)\n"
        "movntq %%mm6, 48(%1)\n"
        "movntq %%mm7, 56(%1)\n"
        :: "r" (from), "r" (to) : "memory");
      from = ((const unsigned char *)from) + 64;
      to = ((unsigned char *)to) + 64;
    }
    /* since movntq is weakly-ordered, a "sfence"
     * is needed to become ordered again. */
    __asm__ __volatile__ ("sfence":::"memory");
    __asm__ __volatile__ ("emms":::"memory");
  }
  /*
   * Now do the tail of the block
   */
  if(len) linux_kernel_memcpy_impl(to, from, len);
  return retval;
}

static void *linux_kernel_memcpy(void *to, const void *from, size_t len) {
  return linux_kernel_memcpy_impl(to,from,len);
}
#endif /* ARCH_X86 */

static const struct {
  const char name[16];
  void *(*const function)(void *to, const void *from, size_t len);

  uint32_t cpu_require;
} memcpy_method[] =
{
  { "", NULL, 0 },
  { "libc", memcpy, 0 },
#if (defined(ARCH_X86) || defined(ARCH_X86_64))
  { "linux kernel", linux_kernel_memcpy, 0 },
  { "MMX ", mmx_memcpy, MM_MMX },
  { "MMXEXT", mmx2_memcpy, MM_MMXEXT },
  { "SSE", sse_memcpy, MM_MMXEXT|MM_SSE },
# ifdef HAVE_AVX
  { "AVX", avx_memcpy, MM_ACCEL_X86_AVX },
# endif /* HAVE_AVX */
#endif /* ARCH_X86 */
  { "", NULL, 0 }
};
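
/*
 * Layout of the table above (describing the existing code): entry 0 is the
 * "probe" placeholder that corresponds to the first value of the config
 * enum, the empty-name entry at the end terminates the benchmark loop in
 * the probe function, and a method is only eligible when all bits of its
 * cpu_require mask are present in the flags returned by xine_mm_accel().
 */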

static uint64_t memcpy_timing[sizeof(memcpy_method)/sizeof(memcpy_method[0])] = { 0, };

#ifdef HAVE_POSIX_TIMERS
/* Prefer clock_gettime() where available. */

# ifndef CLOCK_THREAD_CPUTIME_ID
/* not defined in NetBSD (bug #535) */
# define CLOCK_THREAD_CPUTIME_ID CLOCK_MONOTONIC
# endif

static int64_t _x_gettime(void)
{
  struct timespec tm;
  return (clock_gettime (CLOCK_THREAD_CPUTIME_ID, &tm) == -1)
       ? times (NULL)
       : (int64_t)tm.tv_sec * 1e9 + tm.tv_nsec;
}
# define rdtsc(x) _x_gettime()

#elif (defined(ARCH_X86) || defined(ARCH_X86_64)) && defined(HAVE_SYS_TIMES_H)
static int64_t rdtsc(int config_flags)
{
  int64_t x;

  /* the MMX check should prevent us from executing rdtsc on cpus
     that are too old to have it */
  if( config_flags & MM_MMX ) {
    __asm__ volatile (".byte 0x0f, 0x31" : "=A" (x));
    return x;
  } else {
    return times(NULL);
  }
}
#else

static uint64_t rdtsc(int config_flags)
{
  (void)config_flags;
  /* FIXME: implement an equivalent for using optimized memcpy on other
     architectures */
#ifdef HAVE_SYS_TIMES_H
  struct tms tp;
  return times(&tp);
#else
  return clock();
#endif /* HAVE_SYS_TIMES_H */
}
#endif
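
/*
 * Note on rdtsc()/_x_gettime(): the unit of the returned value differs
 * between the branches above (nanoseconds, TSC cycles, or clock ticks).
 * That is acceptable here because the timings below are only compared
 * against each other within a single probe run, never across runs or
 * machines.
 */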

#ifdef XINE_COMPILE
static void update_fast_memcpy(void *user_data, xine_cfg_entry_t *entry) {
  static int config_flags = -1;
  xine_t *xine = (xine_t *) user_data;
  int method;

  config_flags = xine_mm_accel();

  method = entry->num_value;

  if (method != 0
      && (config_flags & memcpy_method[method].cpu_require) ==
      memcpy_method[method].cpu_require ) {
    lprintf("using %s memcpy()\n", memcpy_method[method].name );
    xine_fast_memcpy = memcpy_method[method].function;
    return;
  } else {
    xprintf(xine, XINE_VERBOSITY_DEBUG, "xine: will probe memcpy on startup\n" );
  }
}
#endif /* XINE_COMPILE */

#define BUFSIZE 1024*1024
#ifdef XINE_COMPILE
void xine_probe_fast_memcpy(xine_t *xine)
#else
void probe_fast_memcpy(void)
#endif /* XINE_COMPILE */
{
  uint64_t t;
  char *buf1, *buf2;
  int i, j, best;
  int config_flags = -1;

#ifdef XINE_COMPILE

  static const char *const memcpy_methods[] = {
    "probe", "libc",
#if (defined(ARCH_X86) || defined(ARCH_X86_64))
    "kernel", "mmx", "mmxext", "sse",
# ifdef HAVE_AVX
    "avx",
# endif /* HAVE_AVX */
#endif
    NULL
  };

  config_flags = xine_mm_accel();

  best = xine->config->register_enum (xine->config, "engine.performance.memcpy_method", 0,
                                      (char **)memcpy_methods,
                                      _("memcopy method used by xine"),
                                      _("The copying of large memory blocks is one of the most "
                                        "expensive operations on todays computers. Therefore xine "
                                        "provides various tuned methods to do this copying. "
                                        "Usually, the best method is detected automatically."),
                                      20, update_fast_memcpy, (void *) xine);

  /* check if function is configured and valid for this machine */
  if( best != 0 &&
      best < (int)(sizeof(memcpy_methods)/sizeof(memcpy_methods[0])) - 1 &&
      (config_flags & memcpy_method[best].cpu_require) ==
      memcpy_method[best].cpu_require ) {
    lprintf("using %s memcpy()\n", memcpy_method[best].name );
    xine_fast_memcpy = memcpy_method[best].function;
    return;
  }
#else /* XINE_COMPILE */
  config_flags = xine_mm_accel();
#endif /* XINE_COMPILE */
  best = 0;

  xine_fast_memcpy = memcpy;

  if( (buf1 = malloc(BUFSIZE)) == NULL )
    return;

  if( (buf2 = malloc(BUFSIZE)) == NULL ) {
    free(buf1);
    return;
  }

  xprintf(_("Benchmarking memcpy methods (smaller is better):\n"));
  /* make sure buffers are present on physical memory */
  memset(buf1,0,BUFSIZE);
  memset(buf2,0,BUFSIZE);

  /* some initial activity to ensure that we're not running slowly :-) */
  for(j=0;j<50;j++) {
    memcpy_method[1].function(buf2,buf1,BUFSIZE);
    memcpy_method[1].function(buf1,buf2,BUFSIZE);
  }

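  /*
   * Benchmark loop below (a description of the existing code, nothing new):
   * every method whose cpu_require bits are all present in config_flags is
   * timed over 50 round trips of BUFSIZE bytes in each direction (roughly
   * 100 MiB copied per method), and the method with the smallest elapsed
   * value wins.
   */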
  for(i=1; memcpy_method[i].name[0]; i++)
  {
    if( (config_flags & memcpy_method[i].cpu_require) !=
        memcpy_method[i].cpu_require )
      continue;

    t = rdtsc(config_flags);
    for(j=0;j<50;j++) {
      memcpy_method[i].function(buf2,buf1,BUFSIZE);
      memcpy_method[i].function(buf1,buf2,BUFSIZE);
    }

    t = rdtsc(config_flags) - t;
    memcpy_timing[i] = t;

    xprintf("\t%s : %lld\n", memcpy_method[i].name, (long long int)t);

    if( best == 0 || t < memcpy_timing[best] )
      best = i;
  }

#ifdef XINE_COMPILE
  xine->config->update_num (xine->config, "engine.performance.memcpy_method", best);
#else /* XINE_COMPILE */
  xprintf("using -> '%s'\n", memcpy_method[best].name);
  xine_fast_memcpy = memcpy_method[best].function;
#endif /* XINE_COMPILE */

  free(buf1);
  free(buf2);
}