/*
 * memcpy.c - optimized memcpy() routines for aclib
 * Written by Andrew Church <achurch@achurch.org>
 *
 * This file is part of transcode, a video stream processing tool.
 * transcode is free software, distributable under the terms of the GNU
 * General Public License (version 2 or later).  See the file COPYING
 * for details.
 */

#include "ac.h"
#include "ac_internal.h"
#include <string.h>

/* Use memmove because memcpy isn't guaranteed to be ascending */
static void *(*memcpy_ptr)(void *, const void *, size_t) = memmove;
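
/* For example, a caller shifting data downward within a single buffer relies
 * on an ascending copy; memmove guarantees a correct result for that kind of
 * overlap, while plain memcpy does not (illustrative call only):
 *
 *     ac_memcpy(buf, buf + 16, len - 16);
 */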

/*************************************************************************/

/* External interface */

void *ac_memcpy(void *dest, const void *src, size_t size)
{
    return (*memcpy_ptr)(dest, src, size);
}
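
/* Minimal usage sketch (illustrative only, never compiled): the accel flags
 * would normally come from aclib's CPU detection rather than being supplied
 * by hand, and example_copy() is a hypothetical caller. */
#if 0
static void example_copy(void *dst, const void *src, size_t bytes,
                         int accel_flags)
{
    ac_memcpy_init(accel_flags);    /* pick the best routine once      */
    ac_memcpy(dst, src, bytes);     /* then use as a memcpy() drop-in  */
}
#endif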

/*************************************************************************/
/*************************************************************************/

/* Note the check for ARCH_X86 here: this is to prevent compilation of this
 * code on x86_64, since all x86_64 processors support SSE2, and because
 * this code is not set up to use the 64-bit registers for addressing on
 * x86_64. */

#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)

/* MMX-optimized routine, intended for PMMX/PII processors.
 * Nonstandard instructions used:
 *     (CPUID.MMX)   MOVQ
 */
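
/* The "computed jump" used below (lea 0f / sub / jmp *reg) lands directly on
 * the first MOVSB that is actually needed, much like a fall-through switch
 * in C.  Reference-only sketch of the equivalent logic (hypothetical helper,
 * never compiled):
 */
#if 0
static void copy_tail_0_to_7(unsigned char *d, const unsigned char *s,
                             unsigned n /* 0..7 */)
{
    switch (n) {            /* every case falls through to the next */
      case 7: *d++ = *s++;
      case 6: *d++ = *s++;
      case 5: *d++ = *s++;
      case 4: *d++ = *s++;
      case 3: *d++ = *s++;
      case 2: *d++ = *s++;
      case 1: *d++ = *s++;
      case 0: break;
    }
}
#endif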

static void *memcpy_mmx(void *dest, const void *src, size_t bytes)
{
    asm("\
PENTIUM_LINE_SIZE = 32          # PMMX/PII cache line size              \n\
PENTIUM_CACHE_SIZE = 8192       # PMMX/PII total cache size             \n\
# Use only half because writes may touch the cache too (PII)            \n\
PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE)        \n\
                                                                        \n\
        push %%ebx              # Save PIC register                     \n\
        push %%edi              # Save destination for return value     \n\
        cld                     # MOVS* should ascend                   \n\
                                                                        \n\
        mov $64, %%ebx          # Constant                              \n\
                                                                        \n\
        cmp %%ebx, %%ecx                                                \n\
        jb mmx.memcpy_last      # Just use movs if <64 bytes            \n\
                                                                        \n\
        # First align destination address to a multiple of 8 bytes      \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7                   \n\
        sub %%edi, %%eax                                                \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS  // Because "lea 0f" requires a textrel
"       xchg %%eax, %%ecx                                               \n\
        mov %%ecx, %%edx                                                \n\
        repz movsb                                                      \n\
        mov %%eax, %%ecx                                                \n\
        mov %%edx, %%eax                                                \n"
#else
"       lea 0f, %%edx           # Use a computed jump--faster than a loop\n\
        sub %%eax, %%edx                                                \n\
        jmp *%%edx              # Execute 0-7 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n"
#endif
"0:     sub %%eax, %%ecx        # Update count                          \n\
                                                                        \n\
        # Now copy data in blocks                                       \n\
0:      mov %%ecx, %%edx        # EDX <- ECX >> 6 (cache lines to copy) \n\
        shr $6, %%edx                                                   \n\
        jz mmx.memcpy_last      # <64 bytes left?  Skip to end          \n\
        cmp $PENTIUM_CACHE_BLOCK/64, %%edx                              \n\
        jb 1f                   # Limit size of block                   \n\
        mov $PENTIUM_CACHE_BLOCK/64, %%edx                              \n\
1:      mov %%edx, %%eax        # EAX <- EDX << 6 (bytes to copy)       \n\
        shl $6, %%eax                                                   \n\
        sub %%eax, %%ecx        # Update remaining count                \n\
        add %%eax, %%esi        # Point to end of region to be block-copied\n\
2:      test %%eax, -32(%%esi)  # Touch each cache line in reverse order\n\
        test %%eax, -64(%%esi)                                          \n\
        sub %%ebx, %%esi        # Update pointer                        \n\
        sub %%ebx, %%eax        # And loop                              \n\
        jnz 2b                                                          \n\
        # Note that ESI now points to the beginning of the block        \n\
3:      movq   (%%esi), %%mm0   # Do the actual copy, 64 bytes at a time\n\
        movq  8(%%esi), %%mm1                                           \n\
        movq 16(%%esi), %%mm2                                           \n\
        movq 24(%%esi), %%mm3                                           \n\
        movq 32(%%esi), %%mm4                                           \n\
        movq 40(%%esi), %%mm5                                           \n\
        movq 48(%%esi), %%mm6                                           \n\
        movq 56(%%esi), %%mm7                                           \n\
        movq %%mm0,   (%%edi)                                           \n\
        movq %%mm1,  8(%%edi)                                           \n\
        movq %%mm2, 16(%%edi)                                           \n\
        movq %%mm3, 24(%%edi)                                           \n\
        movq %%mm4, 32(%%edi)                                           \n\
        movq %%mm5, 40(%%edi)                                           \n\
        movq %%mm6, 48(%%edi)                                           \n\
        movq %%mm7, 56(%%edi)                                           \n\
        add %%ebx, %%esi        # Update pointers                       \n\
        add %%ebx, %%edi                                                \n\
        dec %%edx               # And loop                              \n\
        jnz 3b                                                          \n\
        jmp 0b                                                          \n\
                                                                        \n\
mmx.memcpy_last:                                                        \n\
        # Copy last <64 bytes, using the computed jump trick            \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2                         \n\
        shr $2, %%eax                                                   \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx                                               \n\
        repz movsd                                                      \n\
        mov %%eax, %%ecx                                                \n"
#else
"       lea 0f, %%edx                                                   \n\
        sub %%eax, %%edx                                                \n\
        jmp *%%edx              # Execute 0-15 MOVSD's                  \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n"
#endif
"0:     and $3, %%ecx           # ECX <- ECX & 3                        \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       repz movsb                                                      \n"
#else
"       lea 0f, %%edx                                                   \n\
        sub %%ecx, %%edx                                                \n\
        jmp *%%edx              # Execute 0-3 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n"
#endif
"0:                                                                     \n\
        # All done!                                                     \n\
        emms                    # Clean up MMX state                    \n\
        pop %%edi               # Restore destination (return value)    \n\
        pop %%ebx               # Restore PIC register                  \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%eax", "%edx"
    );
    return dest;
}
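
/* Reference-only C outline of the strategy above (hypothetical helper, never
 * compiled): align the destination to 8 bytes, then copy in blocks of at most
 * half the L1 data cache, touching each block's cache lines in reverse order
 * to preload them before the 64-bytes-per-iteration MOVQ copy loop. */
#if 0
#include <stdint.h>
static void memcpy_mmx_outline(unsigned char *d, const unsigned char *s,
                               size_t n)
{
    const size_t max_lines = (8192/2 - 32) / 64;  /* PENTIUM_CACHE_BLOCK/64 */
    size_t head = (8 - (uintptr_t)d) & 7;         /* bytes to 8-byte-align dest */
    size_t block, off, i;

    if (head > n)
        head = n;
    for (; head > 0; head--, n--)                 /* 0-7 byte alignment head */
        *d++ = *s++;
    while (n >= 64) {
        block = n / 64;                           /* whole cache-line pairs   */
        if (block > max_lines)                    /* cap the block size       */
            block = max_lines;
        block *= 64;                              /* bytes in this block      */
        for (off = block; off >= 64; off -= 64) { /* preload, reverse order   */
            (void)*(volatile const unsigned char *)(s + off - 32);
            (void)*(volatile const unsigned char *)(s + off - 64);
        }
        for (i = 0; i < block; i++)               /* copy forward (MOVQ x8)   */
            d[i] = s[i];
        d += block;
        s += block;
        n -= block;
    }
    while (n-- > 0)                               /* trailing 0-63 bytes      */
        *d++ = *s++;
}
#endif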

#endif  /* HAVE_ASM_MMX && ARCH_X86 */

/*************************************************************************/

#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)

/* SSE-optimized routine.  Backported from AMD64 routine below.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.MMX)   MOVQ
 *     (CPUID.SSE)   MOVNTQ
 */

static void *memcpy_sse(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%ebx              # Save PIC register                     \n\
        push %%edi              # Save destination for return value     \n\
        cld                     # MOVS* should ascend                   \n\
                                                                        \n\
        cmp $64, %%ecx          # Skip block copy for small blocks      \n\
        jb sse.memcpy_last                                              \n\
                                                                        \n\
        mov $128, %%ebx         # Constant used later                   \n\
                                                                        \n\
        # First align destination address to a multiple of 8 bytes      \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7                   \n\
        sub %%edi, %%eax                                                \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx                                               \n\
        mov %%ecx, %%edx                                                \n\
        repz movsb                                                      \n\
        mov %%eax, %%ecx                                                \n\
        mov %%edx, %%eax                                                \n"
#else
"       lea 0f, %%edx           # Use a computed jump--faster than a loop\n\
        sub %%eax, %%edx                                                \n\
        jmp *%%edx              # Execute 0-7 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n"
#endif
"0:     sub %%eax, %%ecx        # Update count                          \n\
                                                                        \n\
        cmp $0x10040, %%ecx     # Is this a large block? (0x10040 is an \n\
                                # arbitrary value where prefetching and \n\
                                # write combining seem to start becoming\n\
                                # faster)                               \n\
        jae sse.memcpy_bp       # Yup, use prefetch copy                \n\
                                                                        \n\
sse.memcpy_small:               # Small block copy routine--no prefetch \n"
#if 0
"       mov %%ecx, %%edx        # EDX <- bytes to copy / 8              \n\
        shr $3, %%edx                                                   \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later      \n\
        shl $3, %%eax                                                   \n\
        sub %%eax, %%ecx                                                \n\
        .balign 16                                                      \n\
0:      movq (%%esi), %%mm0     # Copy 8 bytes of data                  \n\
        movq %%mm0, (%%edi)                                             \n\
        add $8, %%esi           # Update pointers                       \n\
        add $8, %%edi                                                   \n\
        dec %%edx               # And loop                              \n\
        jg 0b                                                           \n\
        jmp sse.memcpy_last     # Copy any remaining bytes              \n\
                                                                        \n\
        nop                     # Align loops below                     \n"
#else
"       # It appears that a simple rep movs is faster than cleverness   \n\
        # with movq...                                                  \n\
        mov %%ecx, %%edx        # EDX <- ECX & 3                        \n\
        and $3, %%edx                                                   \n\
        shr $2, %%ecx           # ECX <- ECX >> 2                       \n\
        rep movsl               # Copy away!                            \n\
        mov %%edx, %%ecx        # Take care of last 0-3 bytes           \n\
        rep movsb                                                       \n\
        jmp sse.memcpy_end      # And exit                              \n\
                                                                        \n\
        .balign 16                                                      \n\
        nop                                                             \n\
        nop                                                             \n"
#endif
"sse.memcpy_bp:                 # Block prefetch copy routine           \n\
0:      mov %%ecx, %%edx        # EDX: temp counter                     \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes)  \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes)       \n\
        cmova %%ebx, %%edx                                              \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8        \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8     \n\
                                #        (also used as memory offset)   \n\
1:      test %%eax, -64(%%esi,%%eax,8)  # Preload cache lines in pairs  \n\
        test %%eax, -128(%%esi,%%eax,8) # (going backwards)             \n\
        # (note that test %%eax,... seems to be faster than prefetchnta \n\
        #  on x86)                                                      \n\
        sub $16, %%eax          # And loop                              \n\
        jg 1b                                                           \n\
                                                                        \n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments                                            \n\
        xor %%eax, %%eax                                                \n\
2:      movq (%%esi,%%eax,8), %%mm0 # Copy 8 bytes and loop             \n\
        movntq %%mm0, (%%edi,%%eax,8)                                   \n\
        inc %%eax                                                       \n\
        cmp %%edx, %%eax                                                \n\
        jb 2b                                                           \n\
                                                                        \n\
        # Finally, update pointers and count, and loop                  \n\
        shl $3, %%edx           # EDX <- bytes copied                   \n\
        add %%edx, %%esi                                                \n\
        add %%edx, %%edi                                                \n\
        sub %%edx, %%ecx                                                \n\
        cmp $64, %%ecx          # At least one cache line left?         \n\
        jae 0b                  # Yup, loop                             \n\
                                                                        \n\
sse.memcpy_last:                                                        \n\
        # Copy last <64 bytes, using the computed jump trick            \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2                         \n\
        shr $2, %%eax                                                   \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx                                               \n\
        repz movsd                                                      \n\
        mov %%eax, %%ecx                                                \n"
#else
"       lea 0f, %%edx                                                   \n\
        sub %%eax, %%edx                                                \n\
        jmp *%%edx              # Execute 0-15 MOVSD's                  \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n\
        movsd                                                           \n"
#endif
"0:     and $3, %%ecx           # ECX <- ECX & 3                        \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       repz movsb                                                      \n"
#else
"       lea sse.memcpy_end, %%edx                                       \n\
        sub %%ecx, %%edx                                                \n\
        jmp *%%edx              # Execute 0-3 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n"
#endif
"                                                                       \n\
sse.memcpy_end:                                                         \n\
        # All done!                                                     \n\
        emms                    # Clean up after MMX instructions       \n\
        sfence                  # Flush the write buffer                \n\
        pop %%edi               # Restore destination (return value)    \n\
        pop %%ebx               # Restore PIC register                  \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%eax", "%edx"
    );
    return dest;
}
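
/* The inner streaming loop above, expressed with compiler intrinsics for
 * reference (illustrative sketch only, never compiled; assumes an
 * 8-byte-aligned destination and a byte count that is a multiple of 8). */
#if 0
#include <xmmintrin.h>
static void stream_copy_outline(void *dest, const void *src, size_t bytes)
{
    __m64 *d = (__m64 *)dest;
    const __m64 *s = (const __m64 *)src;
    size_t i;

    for (i = 0; i < bytes / 8; i++)
        _mm_stream_pi(d + i, s[i]);     /* MOVNTQ: non-temporal store    */
    _mm_empty();                        /* EMMS: clear MMX state         */
    _mm_sfence();                       /* SFENCE: flush write combining */
}
#endif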

#endif  /* HAVE_ASM_SSE && ARCH_X86 */

/*************************************************************************/

#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)

/* AMD64-optimized routine, using SSE2.  Derived from AMD64 optimization
 * guide section 5.13: Appropriate Memory Copying Routines.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.SSE2)  MOVDQA, MOVDQU, MOVNTDQ
 *
 * Note that this routine will also run more or less as-is (modulo register
 * names and label(%%rip) references) on x86 CPUs, but tests have shown the
 * SSE1 version above to be faster.
 */

/* The block copying code--macroized because we use two versions of it
 * depending on whether the source is 16-byte-aligned or not.  Pass either
 * movdqa or movdqu (unquoted) for the parameter. */
#define AMD64_BLOCK_MEMCPY(movdq) \
"       # First prefetch (note that if we end on an odd number of cache \n\
        # lines, we skip prefetching the last one--faster that way than \n\
        # prefetching line by line or treating it as a special case)    \n\
0:      mov %%ecx, %%edx        # EDX: temp counter (always <32 bits)   \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes)  \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes)       \n\
        cmova %%ebx, %%edx                                              \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8        \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8     \n\
                                #        (also used as memory offset)   \n\
1:      prefetchnta -64(%%rsi,%%rax,8)  # Preload cache lines in pairs  \n\
        prefetchnta -128(%%rsi,%%rax,8) # (going backwards)             \n\
        sub $16, %%eax          # And loop                              \n\
        jg 1b                                                           \n\
                                                                        \n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments                                            \n\
        xor %%eax, %%eax                                                \n\
2:      " #movdq " (%%rsi,%%rax,8), %%xmm0 # Copy 16 bytes and loop     \n\
        movntdq %%xmm0, (%%rdi,%%rax,8)                                 \n\
        add $2, %%eax                                                   \n\
        cmp %%edx, %%eax                                                \n\
        jb 2b                                                           \n\
                                                                        \n\
        # Finally, update pointers and count, and loop                  \n\
        shl $3, %%edx           # EDX <- bytes copied                   \n\
        add %%rdx, %%rsi                                                \n\
        add %%rdx, %%rdi                                                \n\
        sub %%rdx, %%rcx                                                \n\
        cmp $64, %%rcx          # At least one cache line left?         \n\
        jae 0b                  # Yup, loop                             \n"
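
/* For reference, the MOVDQA/MOVDQU split the macro is instantiated with,
 * as a compiler-intrinsics sketch (illustrative only, never compiled;
 * assumes the destination is already 16-byte aligned and that bytes is a
 * multiple of 16, and omits the prefetch pass). */
#if 0
#include <emmintrin.h>
#include <stdint.h>
static void block_copy_outline(void *dest, const void *src, size_t bytes)
{
    __m128i *d = (__m128i *)dest;               /* 16-byte aligned */
    const __m128i *s = (const __m128i *)src;
    size_t i, n = bytes / 16;

    if (((uintptr_t)src & 15) == 0) {
        for (i = 0; i < n; i++)                 /* MOVDQA + MOVNTDQ */
            _mm_stream_si128(d + i, _mm_load_si128(s + i));
    } else {
        for (i = 0; i < n; i++)                 /* MOVDQU + MOVNTDQ */
            _mm_stream_si128(d + i, _mm_loadu_si128(s + i));
    }
    _mm_sfence();                               /* flush write combining */
}
#endif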

static void *memcpy_amd64(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%rdi              # Save destination for return value     \n\
        cld                     # MOVS* should ascend                   \n\
                                                                        \n\
        cmp $64, %%rcx          # Skip block copy for small blocks      \n\
        jb amd64.memcpy_last                                            \n\
                                                                        \n\
        mov $128, %%ebx         # Constant used later                   \n\
                                                                        \n\
        # First align destination address to a multiple of 16 bytes     \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7                   \n\
        sub %%edi, %%eax        # (we don't care about the top 32 bits) \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n\
        lea 0f(%%rip), %%rdx    # Use a computed jump--faster than a loop\n\
        sub %%rax, %%rdx                                                \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
0:      sub %%rax, %%rcx        # Update count                          \n\
        test $8, %%edi          # Is destination not 16-byte aligned?   \n\
        je 1f                                                           \n\
        movsq                   # Then move 8 bytes to align it         \n\
        sub $8, %%rcx                                                   \n\
                                                                        \n\
1:      cmp $0x38000, %%rcx     # Is this a large block? (0x38000 is an \n\
                                # arbitrary value where prefetching and \n\
                                # write combining seem to start becoming\n\
                                # faster)                               \n\
        jb amd64.memcpy_small   # Nope, use small copy (no prefetch/WC) \n\
        test $15, %%esi         # Is source also 16-byte aligned?       \n\
                                # (use ESI to save a REX prefix byte)   \n\
        jnz amd64.memcpy_normal_bp # Nope, use slow copy                \n\
        jmp amd64.memcpy_fast_bp # Yup, use fast copy                   \n\
                                                                        \n\
amd64.memcpy_small:             # Small block copy routine--no prefetch \n\
        mov %%ecx, %%edx        # EDX <- bytes to copy / 16             \n\
        shr $4, %%edx           # (count known to fit in 32 bits)       \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later      \n\
        shl $4, %%eax                                                   \n\
        sub %%eax, %%ecx                                                \n\
        .balign 16                                                      \n\
0:      movdqu (%%rsi), %%xmm0  # Copy 16 bytes of data                 \n\
        movdqa %%xmm0, (%%rdi)                                          \n\
        add $16, %%rsi          # Update pointers                       \n\
        add $16, %%rdi                                                  \n\
        dec %%edx               # And loop                              \n\
        jnz 0b                                                          \n\
        jmp amd64.memcpy_last   # Copy any remaining bytes              \n\
                                                                        \n\
        .balign 16                                                      \n\
        nop                                                             \n\
        nop                                                             \n\
amd64.memcpy_fast_bp:           # Fast block prefetch loop              \n"
AMD64_BLOCK_MEMCPY(movdqa)
"       jmp amd64.memcpy_last   # Copy any remaining bytes              \n\
                                                                        \n\
        .balign 16                                                      \n\
        nop                                                             \n\
        nop                                                             \n\
amd64.memcpy_normal_bp:         # Normal (unaligned) block prefetch loop\n"
AMD64_BLOCK_MEMCPY(movdqu)
"                                                                       \n\
amd64.memcpy_last:                                                      \n\
        # Copy last <64 bytes, using the computed jump trick            \n\
        mov %%ecx, %%eax        # EAX <- ECX>>3                         \n\
        shr $3, %%eax                                                   \n\
        lea 0f(%%rip), %%rdx                                            \n\
        add %%eax, %%eax        # Watch out, MOVSQ is 2 bytes!          \n\
        sub %%rax, %%rdx                                                \n\
        jmp *%%rdx              # Execute 0-7 MOVSQ's                   \n\
        movsq                                                           \n\
        movsq                                                           \n\
        movsq                                                           \n\
        movsq                                                           \n\
        movsq                                                           \n\
        movsq                                                           \n\
        movsq                                                           \n\
0:      and $7, %%ecx           # ECX <- ECX & 7                        \n\
        lea 0f(%%rip), %%rdx                                            \n\
        sub %%rcx, %%rdx                                                \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's                   \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
        movsb                                                           \n\
0:                                                                      \n\
        # All done!                                                     \n\
        emms                    # Clean up after MMX instructions       \n\
        sfence                  # Flush the write buffer                \n\
        pop %%rdi               # Restore destination (return value)    \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%rax", "%rbx", "%rdx"
    );
    return dest;
}

#endif  /* HAVE_ASM_SSE2 && ARCH_X86_64 */

/*************************************************************************/

/* Initialization routine. */

int ac_memcpy_init(int accel)
{
    memcpy_ptr = memmove;

#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
    if (HAS_ACCEL(accel, AC_MMX))
        memcpy_ptr = memcpy_mmx;
#endif

#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)
    if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE))
        memcpy_ptr = memcpy_sse;
#endif

#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)
    if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2))
        memcpy_ptr = memcpy_amd64;
#endif

    return 1;
}
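
/* Note that the checks above cascade, so when several flags are set the most
 * capable routine that the flags (and the build configuration) permit is the
 * one that wins.  Illustrative calls (flag names as used above):
 *
 *     ac_memcpy_init(0);                       -> memmove
 *     ac_memcpy_init(AC_MMX);                  -> memcpy_mmx   (32-bit x86)
 *     ac_memcpy_init(AC_MMX|AC_CMOVE|AC_SSE);  -> memcpy_sse   (32-bit x86)
 *     ac_memcpy_init(AC_CMOVE|AC_SSE2);        -> memcpy_amd64 (x86-64)
 */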

/*************************************************************************/

/*
 * Local variables:
 *   c-file-style: "stroustrup"
 *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *   indent-tabs-mode: nil
 * End:
 *
 * vim: expandtab shiftwidth=4:
 */