/*
 * memcpy.c - optimized memcpy() routines for aclib
 * Written by Andrew Church <achurch@achurch.org>
 *
 * This file is part of transcode, a video stream processing tool.
 * transcode is free software, distributable under the terms of the GNU
 * General Public License (version 2 or later).  See the file COPYING
 * for details.
 */

#include "ac.h"
#include "ac_internal.h"
#include <string.h>

/* Use memmove because memcpy isn't guaranteed to be ascending */
static void *(*memcpy_ptr)(void *, const void *, size_t) = memmove;

/*************************************************************************/

/* External interface */

void *ac_memcpy(void *dest, const void *src, size_t size)
{
    return (*memcpy_ptr)(dest, src, size);
}
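
/* Typical usage, as a minimal sketch (never compiled--this block is for
 * illustration only).  The accel flags shown here are illustrative; real
 * callers pass a flag set obtained from runtime CPU detection rather than
 * hard-coding it, and the buffer/function names below are ours, not part
 * of aclib. */
#if 0
#include "ac.h"

static char src_buf[4096], dest_buf[4096];

static void example(void)
{
    ac_memcpy_init(AC_CMOVE | AC_MMX | AC_SSE);    /* pick the best routine */
    ac_memcpy(dest_buf, src_buf, sizeof(src_buf)); /* then copy as usual    */
}
#endif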

/*************************************************************************/
/*************************************************************************/

/* Note the check for ARCH_X86 here: this is to prevent compilation of this
 * code on x86_64, since all x86_64 processors support SSE2, and because
 * this code is not set up to use the 64-bit registers for addressing on
 * x86_64. */

#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)

/* MMX-optimized routine, intended for PMMX/PII processors.
 * Nonstandard instructions used:
 *     (CPUID.MMX) MOVQ
 */

static void *memcpy_mmx(void *dest, const void *src, size_t bytes)
{
    asm("\
PENTIUM_LINE_SIZE = 32          # PMMX/PII cache line size \n\
PENTIUM_CACHE_SIZE = 8192       # PMMX/PII total cache size \n\
# Use only half because writes may touch the cache too (PII) \n\
PENTIUM_CACHE_BLOCK = (PENTIUM_CACHE_SIZE/2 - PENTIUM_LINE_SIZE) \n\
\n\
        push %%ebx              # Save PIC register \n\
        push %%edi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
\n\
        mov $64, %%ebx          # Constant \n\
\n\
        cmp %%ebx, %%ecx \n\
        jb mmx.memcpy_last      # Just use movs if <64 bytes \n\
\n\
        # First align destination address to a multiple of 8 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS  // Because "lea 0f" requires a textrel
"       xchg %%eax, %%ecx \n\
        mov %%ecx, %%edx \n\
        repz movsb \n\
        mov %%eax, %%ecx \n\
        mov %%edx, %%eax \n"
#else
"       lea 0f, %%edx           # Use a computed jump--faster than a loop\n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n"
#endif
"0:     sub %%eax, %%ecx        # Update count \n\
\n\
        # Now copy data in blocks \n\
0:      mov %%ecx, %%edx        # EDX <- ECX >> 6 (cache lines to copy) \n\
        shr $6, %%edx \n\
        jz mmx.memcpy_last      # <64 bytes left?  Skip to end \n\
        cmp $PENTIUM_CACHE_BLOCK/64, %%edx \n\
        jb 1f                   # Limit size of block \n\
        mov $PENTIUM_CACHE_BLOCK/64, %%edx \n\
1:      mov %%edx, %%eax        # EAX <- EDX << 6 (bytes to copy) \n\
        shl $6, %%eax \n\
        sub %%eax, %%ecx        # Update remaining count \n\
        add %%eax, %%esi        # Point to end of region to be block-copied\n\
2:      test %%eax, -32(%%esi)  # Touch each cache line in reverse order\n\
        test %%eax, -64(%%esi) \n\
        sub %%ebx, %%esi        # Update pointer \n\
        sub %%ebx, %%eax        # And loop \n\
        jnz 2b \n\
        # Note that ESI now points to the beginning of the block \n\
3:      movq (%%esi), %%mm0     # Do the actual copy, 64 bytes at a time\n\
        movq 8(%%esi), %%mm1 \n\
        movq 16(%%esi), %%mm2 \n\
        movq 24(%%esi), %%mm3 \n\
        movq 32(%%esi), %%mm4 \n\
        movq 40(%%esi), %%mm5 \n\
        movq 48(%%esi), %%mm6 \n\
        movq 56(%%esi), %%mm7 \n\
        movq %%mm0, (%%edi) \n\
        movq %%mm1, 8(%%edi) \n\
        movq %%mm2, 16(%%edi) \n\
        movq %%mm3, 24(%%edi) \n\
        movq %%mm4, 32(%%edi) \n\
        movq %%mm5, 40(%%edi) \n\
        movq %%mm6, 48(%%edi) \n\
        movq %%mm7, 56(%%edi) \n\
        add %%ebx, %%esi        # Update pointers \n\
        add %%ebx, %%edi \n\
        dec %%edx               # And loop \n\
        jnz 3b \n\
        jmp 0b \n\
\n\
mmx.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2 \n\
        shr $2, %%eax \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx \n\
        repz movsd \n\
        mov %%eax, %%ecx \n"
#else
"       lea 0f, %%edx \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-15 MOVSD's \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n"
#endif
"0:     and $3, %%ecx           # ECX <- ECX & 3 \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       repz movsb \n"
#else
"       lea 0f, %%edx \n\
        sub %%ecx, %%edx \n\
        jmp *%%edx              # Execute 0-3 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n"
#endif
"0: \n\
        # All done! \n\
        emms                    # Clean up MMX state \n\
        pop %%edi               # Restore destination (return value) \n\
        pop %%ebx               # Restore PIC register \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%eax", "%edx"
    );
    return dest;
}
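
/* For reference, a rough portable-C sketch of the copy strategy used by the
 * MMX routine above: pull a block of source data into the cache by touching
 * each cache line in reverse order, then copy the block forward 64 bytes at
 * a time.  This is illustration only (never compiled); the helper name and
 * the stand-in memcpy() for the eight-MOVQ inner loop are ours, not part of
 * aclib. */
#if 0
#include <stdint.h>

static void *memcpy_mmx_sketch(void *dest, const void *src, size_t bytes)
{
    /* PENTIUM_CACHE_BLOCK from the asm, rounded down to whole cache lines */
    enum { LINE = 32, BLOCK_MAX = (8192/2 - 32) & ~63 };
    uint8_t *d = dest;
    const uint8_t *s = src;

    while (((uintptr_t)d & 7) && bytes > 0) {     /* align dest to 8 bytes */
        *d++ = *s++;
        bytes--;
    }
    while (bytes >= 64) {
        size_t block = bytes & ~(size_t)63;       /* whole cache lines only */
        size_t i;
        if (block > BLOCK_MAX)
            block = BLOCK_MAX;
        for (i = block; i >= LINE; i -= LINE) {   /* touch lines backwards  */
            volatile uint8_t touch = s[i-LINE];   /* (the asm uses dummy    */
            (void)touch;                          /*  TEST reads)           */
        }
        memcpy(d, s, block);   /* stands in for the 8x MOVQ load/store loop */
        d += block;
        s += block;
        bytes -= block;
    }
    while (bytes-- > 0)                           /* last <64 bytes */
        *d++ = *s++;
    return dest;
}
#endif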

#endif  /* HAVE_ASM_MMX && ARCH_X86 */

/*************************************************************************/

#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)

/* SSE-optimized routine.  Backported from AMD64 routine below.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.MMX) MOVQ
 *     (CPUID.SSE) MOVNTQ
 */

static void *memcpy_sse(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%ebx              # Save PIC register \n\
        push %%edi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
\n\
        cmp $64, %%ecx          # Skip block copy for small blocks \n\
        jb sse.memcpy_last \n\
\n\
        mov $128, %%ebx         # Constant used later \n\
\n\
        # First align destination address to a multiple of 8 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx \n\
        mov %%ecx, %%edx \n\
        repz movsb \n\
        mov %%eax, %%ecx \n\
        mov %%edx, %%eax \n"
#else
"       lea 0f, %%edx           # Use a computed jump--faster than a loop\n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n"
#endif
"0:     sub %%eax, %%ecx        # Update count \n\
\n\
        cmp $0x10040, %%ecx     # Is this a large block? (0x10040 is an \n\
                                #    arbitrary value where prefetching and \n\
                                #    write combining seem to start becoming\n\
                                #    faster) \n\
        jae sse.memcpy_bp       # Yup, use prefetch copy \n\
\n\
sse.memcpy_small:               # Small block copy routine--no prefetch \n"
#if 0
"       mov %%ecx, %%edx        # EDX <- bytes to copy / 8 \n\
        shr $3, %%edx \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later \n\
        shl $3, %%eax \n\
        sub %%eax, %%ecx \n\
        .balign 16 \n\
0:      movq (%%esi), %%mm0     # Copy 8 bytes of data \n\
        movq %%mm0, (%%edi) \n\
        add $8, %%esi           # Update pointers \n\
        add $8, %%edi \n\
        dec %%edx               # And loop \n\
        jg 0b \n\
        jmp sse.memcpy_last     # Copy any remaining bytes \n\
\n\
        nop                     # Align loops below \n"
#else
"       # It appears that a simple rep movs is faster than cleverness \n\
        # with movq... \n\
        mov %%ecx, %%edx        # EDX <- ECX & 3 \n\
        and $3, %%edx \n\
        shr $2, %%ecx           # ECX <- ECX >> 2 \n\
        rep movsl               # Copy away! \n\
        mov %%edx, %%ecx        # Take care of last 0-3 bytes \n\
        rep movsb \n\
        jmp sse.memcpy_end      # And exit \n\
\n\
        .balign 16 \n\
        nop \n\
        nop \n"
#endif
"sse.memcpy_bp:                 # Block prefetch copy routine \n\
0:      mov %%ecx, %%edx        # EDX: temp counter \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes) \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes) \n\
        cmova %%ebx, %%edx \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8 \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8 \n\
                                #    (also used as memory offset) \n\
1:      test %%eax, -64(%%esi,%%eax,8)  # Preload cache lines in pairs \n\
        test %%eax, -128(%%esi,%%eax,8) # (going backwards) \n\
        # (note that test %%eax,... seems to be faster than prefetchnta \n\
        #  on x86) \n\
        sub $16, %%eax          # And loop \n\
        jg 1b \n\
\n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments \n\
        xor %%eax, %%eax \n\
2:      movq (%%esi,%%eax,8), %%mm0     # Copy 8 bytes and loop \n\
        movntq %%mm0, (%%edi,%%eax,8) \n\
        inc %%eax \n\
        cmp %%edx, %%eax \n\
        jb 2b \n\
\n\
        # Finally, update pointers and count, and loop \n\
        shl $3, %%edx           # EDX <- bytes copied \n\
        add %%edx, %%esi \n\
        add %%edx, %%edi \n\
        sub %%edx, %%ecx \n\
        cmp $64, %%ecx          # At least one cache line left? \n\
        jae 0b                  # Yup, loop \n\
\n\
sse.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>2 \n\
        shr $2, %%eax \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       xchg %%eax, %%ecx \n\
        repz movsd \n\
        mov %%eax, %%ecx \n"
#else
"       lea 0f, %%edx \n\
        sub %%eax, %%edx \n\
        jmp *%%edx              # Execute 0-15 MOVSD's \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n\
        movsd \n"
#endif
"0:     and $3, %%ecx           # ECX <- ECX & 3 \n"
#ifdef ACLIB_DISABLE_X86_TEXTRELS
"       repz movsb \n"
#else
"       lea sse.memcpy_end, %%edx \n\
        sub %%ecx, %%edx \n\
        jmp *%%edx              # Execute 0-3 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n"
#endif
" \n\
sse.memcpy_end: \n\
        # All done! \n\
        emms                    # Clean up after MMX instructions \n\
        sfence                  # Flush the write buffer \n\
        pop %%edi               # Restore destination (return value) \n\
        pop %%ebx               # Restore PIC register \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%eax", "%edx"
    );
    return dest;
}

#endif  /* HAVE_ASM_SSE && ARCH_X86 */

/*************************************************************************/

#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)

/* AMD64-optimized routine, using SSE2.  Derived from AMD64 optimization
 * guide section 5.13: Appropriate Memory Copying Routines.
 * Nonstandard instructions used:
 *     (CPUID.CMOVE) CMOVA
 *     (CPUID.SSE2) MOVDQA, MOVDQU, MOVNTDQ
 *
 * Note that this routine will also run more or less as-is (modulo register
 * names and label(%%rip) references) on x86 CPUs, but tests have shown the
 * SSE1 version above to be faster.
 */

/* The block copying code--macroized because we use two versions of it
 * depending on whether the source is 16-byte-aligned or not.  Pass either
 * movdqa or movdqu (unquoted) for the parameter. */
#define AMD64_BLOCK_MEMCPY(movdq) \
"       # First prefetch (note that if we end on an odd number of cache \n\
        # lines, we skip prefetching the last one--faster that way than \n\
        # prefetching line by line or treating it as a special case) \n\
0:      mov %%ecx, %%edx        # EDX: temp counter (always <32 bits) \n\
        shr $6, %%edx           # Divide by cache line size (64 bytes) \n\
        cmp %%ebx, %%edx        # ... and cap at 128 (8192 bytes) \n\
        cmova %%ebx, %%edx \n\
        shl $3, %%edx           # EDX <- cache lines to copy * 8 \n\
        mov %%edx, %%eax        # EAX <- cache lines to preload * 8 \n\
                                #    (also used as memory offset) \n\
1:      prefetchnta -64(%%rsi,%%rax,8)  # Preload cache lines in pairs \n\
        prefetchnta -128(%%rsi,%%rax,8) # (going backwards) \n\
        sub $16, %%eax          # And loop \n\
        jg 1b \n\
\n\
        # Then copy--forward, which seems to be faster than reverse for \n\
        # certain alignments \n\
        xor %%eax, %%eax \n\
2:      " #movdq " (%%rsi,%%rax,8), %%xmm0      # Copy 16 bytes and loop \n\
        movntdq %%xmm0, (%%rdi,%%rax,8) \n\
        add $2, %%eax \n\
        cmp %%edx, %%eax \n\
        jb 2b \n\
\n\
        # Finally, update pointers and count, and loop \n\
        shl $3, %%edx           # EDX <- bytes copied \n\
        add %%rdx, %%rsi \n\
        add %%rdx, %%rdi \n\
        sub %%rdx, %%rcx \n\
        cmp $64, %%rcx          # At least one cache line left? \n\
        jae 0b                  # Yup, loop \n"

static void *memcpy_amd64(void *dest, const void *src, size_t bytes)
{
    asm("\
        push %%rdi              # Save destination for return value \n\
        cld                     # MOVS* should ascend \n\
\n\
        cmp $64, %%rcx          # Skip block copy for small blocks \n\
        jb amd64.memcpy_last \n\
\n\
        mov $128, %%ebx         # Constant used later \n\
\n\
        # First align destination address to a multiple of 16 bytes \n\
        mov $8, %%eax           # EAX <- (8-dest) & 7 \n\
        sub %%edi, %%eax        # (we don't care about the top 32 bits) \n\
        and $7, %%eax           # ... which is the number of bytes to copy\n\
        lea 0f(%%rip), %%rdx    # Use a computed jump--faster than a loop\n\
        sub %%rax, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0:      sub %%rax, %%rcx        # Update count \n\
        test $8, %%edi          # Is destination not 16-byte aligned? \n\
        je 1f \n\
        movsq                   # Then move 8 bytes to align it \n\
        sub $8, %%rcx \n\
\n\
1:      cmp $0x38000, %%rcx     # Is this a large block? (0x38000 is an \n\
                                #    arbitrary value where prefetching and \n\
                                #    write combining seem to start becoming\n\
                                #    faster) \n\
        jb amd64.memcpy_small   # Nope, use small copy (no prefetch/WC) \n\
        test $15, %%esi         # Is source also 16-byte aligned? \n\
                                #    (use ESI to save a REX prefix byte) \n\
        jnz amd64.memcpy_normal_bp  # Nope, use slow copy \n\
        jmp amd64.memcpy_fast_bp    # Yup, use fast copy \n\
\n\
amd64.memcpy_small:             # Small block copy routine--no prefetch \n\
        mov %%ecx, %%edx        # EDX <- bytes to copy / 16 \n\
        shr $4, %%edx           # (count known to fit in 32 bits) \n\
        mov %%edx, %%eax        # Leave remainder in ECX for later \n\
        shl $4, %%eax \n\
        sub %%eax, %%ecx \n\
        .balign 16 \n\
0:      movdqu (%%rsi), %%xmm0  # Copy 16 bytes of data \n\
        movdqa %%xmm0, (%%rdi) \n\
        add $16, %%rsi          # Update pointers \n\
        add $16, %%rdi \n\
        dec %%edx               # And loop \n\
        jnz 0b \n\
        jmp amd64.memcpy_last   # Copy any remaining bytes \n\
\n\
        .balign 16 \n\
        nop \n\
        nop \n\
amd64.memcpy_fast_bp:           # Fast block prefetch loop \n"
AMD64_BLOCK_MEMCPY(movdqa)
"       jmp amd64.memcpy_last   # Copy any remaining bytes \n\
\n\
        .balign 16 \n\
        nop \n\
        nop \n\
amd64.memcpy_normal_bp:         # Normal (unaligned) block prefetch loop\n"
AMD64_BLOCK_MEMCPY(movdqu)
" \n\
amd64.memcpy_last: \n\
        # Copy last <64 bytes, using the computed jump trick \n\
        mov %%ecx, %%eax        # EAX <- ECX>>3 \n\
        shr $3, %%eax \n\
        lea 0f(%%rip), %%rdx \n\
        add %%eax, %%eax        # Watch out, MOVSQ is 2 bytes! \n\
        sub %%rax, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSQ's \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
        movsq \n\
0:      and $7, %%ecx           # ECX <- ECX & 7 \n\
        lea 0f(%%rip), %%rdx \n\
        sub %%rcx, %%rdx \n\
        jmp *%%rdx              # Execute 0-7 MOVSB's \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
        movsb \n\
0: \n\
        # All done! \n\
        emms                    # Clean up after MMX instructions \n\
        sfence                  # Flush the write buffer \n\
        pop %%rdi               # Restore destination (return value) \n\
    " : /* no outputs */
      : "D" (dest), "S" (src), "c" (bytes)
      : "%rax", "%rbx", "%rdx"
    );
    return dest;
}
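
/* For reference, the same block-prefetch/streaming-store idea expressed with
 * SSE2 intrinsics.  This is a sketch only (never compiled), assuming both
 * pointers are 16-byte aligned; the helper name is ours, and the real
 * routine above additionally handles unaligned sources (MOVDQU), destination
 * alignment, and the small-block case. */
#if 0
#include <emmintrin.h>          /* SSE2 intrinsics (pulls in xmmintrin.h) */

static void *memcpy_stream_sketch(void *dest, const void *src, size_t bytes)
{
    __m128i *d = dest;
    const __m128i *s = src;

    while (bytes >= 64) {
        /* Prefetch up to 8KB (128 cache lines) of source, backwards and
         * two lines at a time, as the asm above does with PREFETCHNTA */
        size_t block = bytes > 8192 ? 8192 : bytes & ~(size_t)63;
        size_t i;
        for (i = block; i >= 128; i -= 128) {
            _mm_prefetch((const char *)s + i - 64, _MM_HINT_NTA);
            _mm_prefetch((const char *)s + i - 128, _MM_HINT_NTA);
        }
        /* Copy the block forward with non-temporal stores (MOVNTDQ), so
         * the destination data bypasses the cache */
        for (i = 0; i < block / 16; i++)
            _mm_stream_si128(&d[i], _mm_load_si128(&s[i]));
        d += block / 16;
        s += block / 16;
        bytes -= block;
    }
    _mm_sfence();               /* flush the write-combining buffers */
    if (bytes > 0)
        memcpy(d, s, bytes);    /* tail; the asm uses MOVSQ/MOVSB here */
    return dest;
}
#endif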

#endif  /* HAVE_ASM_SSE2 && ARCH_X86_64 */

/*************************************************************************/

/* Initialization routine. */

int ac_memcpy_init(int accel)
{
    memcpy_ptr = memmove;

#if defined(HAVE_ASM_MMX) && defined(ARCH_X86)
    if (HAS_ACCEL(accel, AC_MMX))
        memcpy_ptr = memcpy_mmx;
#endif

#if defined(HAVE_ASM_SSE) && defined(ARCH_X86)
    if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE))
        memcpy_ptr = memcpy_sse;
#endif

#if defined(HAVE_ASM_SSE2) && defined(ARCH_X86_64)
    if (HAS_ACCEL(accel, AC_CMOVE|AC_SSE2))
        memcpy_ptr = memcpy_amd64;
#endif

    return 1;
}

/*************************************************************************/

/*
 * Local variables:
 *   c-file-style: "stroustrup"
 *   c-file-offsets: ((case-label . *) (statement-case-intro . *))
 *   indent-tabs-mode: nil
 * End:
 *
 * vim: expandtab shiftwidth=4:
 */