// Copyright (C) 2004 Id Software, Inc.
//

//===============================================================
//
// 3DNow! implementation of idSIMDProcessor
//
//===============================================================

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef _WIN32
#ifdef SIMD_INSTRUCTIONS
#define WIN32_LEAN_AND_MEAN
#include <windows.h>

#include <stdio.h>

#include "doomtype.h"
#include "m_argv.h"
#include "SDL_cpuinfo.h"
#include "i_simd.h"

memcpy_fast_f memcpy_fast;
memset_fast_f memset_fast;

static void* memcpy_MMX( void *dst, const void *src, size_t count );
static void* memset_MMX( void *dst, int val, size_t count );
static void* memcpy_3DNow( void *dst, const void *src, size_t count );

void I_InitSIMD(void)
{
    memcpy_fast = memcpy;
    memset_fast = memset;

    if (!M_CheckParm("-nosimd"))
    {
        if (SDL_Has3DNow() && !M_CheckParm("-no3dnow"))
        {
            memcpy_fast = memcpy_3DNow;
            fprintf(stdout, "I_Init: using MMX and 3DNow! for SIMD processing\n");
        }
        else
        {
            if (SDL_HasMMX() && !M_CheckParm("-nommx"))
            {
                memcpy_fast = memcpy_MMX;
                memset_fast = memset_MMX;
                fprintf(stdout, "I_Init: using MMX for SIMD processing\n");
            }
        }
    }
}
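
// A minimal usage sketch (hypothetical call site, not compiled in): once
// I_InitSIMD() has run, the function pointers are drop-in replacements for
// the C library routines, so callers need no knowledge of the CPU features.
#if 0
static void ExampleFrameCopy( void )
{
    static byte framebuffer[320 * 200];
    static byte backbuffer[320 * 200];

    I_InitSIMD();               // pick the fastest variants once at startup
    memset_fast( backbuffer, 0, sizeof( backbuffer ) );
    memcpy_fast( framebuffer, backbuffer, sizeof( backbuffer ) );
}
#endif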

#define EMMS_INSTRUCTION __asm emms
#if _MSC_VER > 1300
#define PREFETCH(a) prefetchnta a
#define MOVNTQ movntq
#define SFENCE sfence
#else
#define PREFETCH(a)
#define MOVNTQ movq
#define SFENCE
#endif
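
// With _MSC_VER > 1300 (Visual C++ .NET 2003 and later) the inline assembler
// accepts the prefetchnta/movntq/sfence mnemonics, so the macros above expand
// to the real instructions. On older compilers they degrade gracefully:
// prefetches become no-ops, streaming stores fall back to ordinary cached
// movq stores, and the store fence disappears; the copies stay correct,
// just slower.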

// Highly optimized memcpy() routine for the AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE: Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instruction,
// be sure you're running on an Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 64 * 1024  // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization. This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY infinity // no limit for movq/movntq w/block prefetch (documentation only; never expanded)
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch. The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
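
// The thresholds above boil down to a simple size-based dispatch. A minimal
// C sketch of the selection logic (illustrative only; memcpy_3DNow below does
// the equivalent branching in assembly, plus destination alignment):
#if 0
static void CopyMethodSketch( size_t n )
{
    if ( n < TINY_BLOCK_COPY ) {
        // unrolled movsd copy
    } else if ( n < IN_CACHE_COPY ) {
        // movq loads + movq stores, with software prefetch
    } else if ( n < UNCACHED_COPY ) {
        // movq loads + movntq streaming stores, with software prefetch
    } else {
        // movq loads + movntq streaming stores, with block prefetch
    }
}
#endif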

/*
================
idSIMD_3DNow::Memcpy

optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
static void* memcpy_3DNow( void *dest, const void *src, size_t n ) {
    __asm {

        mov     ecx, [n]        // number of bytes to copy
        mov     edi, [dest]     // destination
        mov     esi, [src]      // source
        mov     ebx, ecx        // keep a copy of count

        cld
        cmp     ecx, TINY_BLOCK_COPY
        jb      $memcpy_ic_3    // tiny? skip mmx copy

        cmp     ecx, 32*1024    // don't align between 32k-64k because
        jbe     $memcpy_do_align //  it appears to be slower
        cmp     ecx, 64*1024
        jbe     $memcpy_align_done
$memcpy_do_align:
        mov     ecx, 8          // a trick that's faster than rep movsb...
        sub     ecx, edi        // align destination to qword
        and     ecx, 111b       // get the low bits
        sub     ebx, ecx        // update copy count
        neg     ecx             // set up to jump into the array
        add     ecx, offset $memcpy_align_done
        jmp     ecx             // jump to array of movsb's

align 4
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb
        movsb

$memcpy_align_done:             // destination is dword aligned
        mov     ecx, ebx        // number of bytes left to copy
        shr     ecx, 6          // get 64-byte block count
        jz      $memcpy_ic_2    // finish the last few bytes

        cmp     ecx, IN_CACHE_COPY/64 // too big for cache? use uncached copy
        jae     $memcpy_uc_test

// This is a small block copy that uses the MMX registers to copy 8 bytes
// at a time. It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1:                   // 64-byte block copies, in-cache copy

        PREFETCH([esi + (200*64/34+192)]) // start reading ahead

        movq    mm0, [esi+0]    // read 64 bits
        movq    mm1, [esi+8]
        movq    [edi+0], mm0    // write 64 bits
        movq    [edi+8], mm1    //    note: the normal movq writes the
        movq    mm2, [esi+16]   //    data to cache; a cache line will be
        movq    mm3, [esi+24]   //    allocated as needed, to store the data
        movq    [edi+16], mm2
        movq    [edi+24], mm3
        movq    mm0, [esi+32]
        movq    mm1, [esi+40]
        movq    [edi+32], mm0
        movq    [edi+40], mm1
        movq    mm2, [esi+48]
        movq    mm3, [esi+56]
        movq    [edi+48], mm2
        movq    [edi+56], mm3

        add     esi, 64         // update source pointer
        add     edi, 64         // update destination pointer
        dec     ecx             // count down
        jnz     $memcpy_ic_1    // last 64-byte block?

$memcpy_ic_2:
        mov     ecx, ebx        // has valid low 6 bits of the byte count
$memcpy_ic_3:
        shr     ecx, 2          // dword count
        and     ecx, 1111b      // only look at the "remainder" bits
        neg     ecx             // set up to jump into the array
        add     ecx, offset $memcpy_last_few
        jmp     ecx             // jump to array of movsd's

$memcpy_uc_test:
        cmp     ecx, UNCACHED_COPY/64 // big enough? use block prefetch copy
        jae     $memcpy_bp_1

$memcpy_64_test:
        or      ecx, ecx        // tail end of block prefetch will jump here
        jz      $memcpy_ic_2    // no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ. This write instruction
// bypasses the cache and writes straight to main memory. This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1:                   // 64-byte blocks, uncached copy

        PREFETCH ([esi + (200*64/34+192)]) // start reading ahead

        movq    mm0, [esi+0]    // read 64 bits
        add     edi, 64         // update destination pointer
        movq    mm1, [esi+8]
        add     esi, 64         // update source pointer
        movq    mm2, [esi-48]
        MOVNTQ  [edi-64], mm0   // write 64 bits, bypassing the cache
        movq    mm0, [esi-40]   //    note: movntq also prevents the CPU
        MOVNTQ  [edi-56], mm1   //    from READING the destination address
        movq    mm1, [esi-32]   //    into the cache, only to be over-written
        MOVNTQ  [edi-48], mm2   //    so that also helps performance
        movq    mm2, [esi-24]
        MOVNTQ  [edi-40], mm0
        movq    mm0, [esi-16]
        MOVNTQ  [edi-32], mm1
        movq    mm1, [esi-8]
        MOVNTQ  [edi-24], mm2
        MOVNTQ  [edi-16], mm0
        dec     ecx
        MOVNTQ  [edi-8], mm1
        jnz     $memcpy_uc_1    // last 64-byte block?

        jmp     $memcpy_ic_2    // almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations. Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
$memcpy_bp_1:                   // large blocks, block prefetch copy

        cmp     ecx, CACHEBLOCK // big enough to run another prefetch loop?
        jl      $memcpy_64_test // no, back to regular uncached copy

        mov     eax, CACHEBLOCK / 2 // block prefetch loop, unrolled 2X
        add     esi, CACHEBLOCK * 64 // move to the top of the block
align 16
$memcpy_bp_2:
        mov     edx, [esi-64]   // grab one address per cache line
        mov     edx, [esi-128]  // grab one address per cache line
        sub     esi, 128        // go reverse order
        dec     eax             // count down the cache lines
        jnz     $memcpy_bp_2    // keep grabbing more lines into cache

        mov     eax, CACHEBLOCK // now that it's in cache, do the copy
align 16
$memcpy_bp_3:
        movq    mm0, [esi   ]   // read 64 bits
        movq    mm1, [esi+ 8]
        movq    mm2, [esi+16]
        movq    mm3, [esi+24]
        movq    mm4, [esi+32]
        movq    mm5, [esi+40]
        movq    mm6, [esi+48]
        movq    mm7, [esi+56]
        add     esi, 64         // update source pointer
        MOVNTQ  [edi   ], mm0   // write 64 bits, bypassing cache
        MOVNTQ  [edi+ 8], mm1   //    note: movntq also prevents the CPU
        MOVNTQ  [edi+16], mm2   //    from READING the destination address
        MOVNTQ  [edi+24], mm3   //    into the cache, only to be over-written,
        MOVNTQ  [edi+32], mm4   //    so that also helps performance
        MOVNTQ  [edi+40], mm5
        MOVNTQ  [edi+48], mm6
        MOVNTQ  [edi+56], mm7
        add     edi, 64         // update dest pointer

        dec     eax             // count down

        jnz     $memcpy_bp_3    // keep copying
        sub     ecx, CACHEBLOCK // update the 64-byte block count
        jmp     $memcpy_bp_1    // keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop". Then it handles the last few bytes.
align 4
        movsd
        movsd                   // perform last 1-15 dword copies
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd                   // perform last 1-7 dword copies
        movsd
        movsd
        movsd
        movsd
        movsd
        movsd

$memcpy_last_few:               // dword aligned from before movsd's
        mov     ecx, ebx        // has valid low 2 bits of the byte count
        and     ecx, 11b        // the last few cows must come home
        jz      $memcpy_final   // no more, let's leave
        rep     movsb           // the last 1, 2, or 3 bytes

$memcpy_final:
        emms                    // clean up the MMX state
        SFENCE                  // flush the write buffer
        mov     eax, [dest]     // ret value = destination pointer

    }
    return dest;
}

/*
================
MMX_Memcpy8B
================
*/
static void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
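    // NOTE: callers guarantee count >= 8; the dec/jnz loop below is entered
    // unconditionally, so a zero iteration count would wrap around to 2^32.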
    _asm {
        mov     esi, src
        mov     edi, dest
        mov     ecx, count
        shr     ecx, 3          // 8 bytes per iteration

loop1:
        movq    mm1, 0[ESI]     // Read in source data
        MOVNTQ  0[EDI], mm1     // Non-temporal stores

        add     esi, 8
        add     edi, 8
        dec     ecx
        jnz     loop1

    }
    EMMS_INSTRUCTION
}

/*
================
MMX_Memcpy64B

165MB/sec
================
*/
static void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
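    // NOTE: callers guarantee count >= 64; as in MMX_Memcpy8B, the loop is
    // entered unconditionally and must run at least once.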
    _asm {
        mov     esi, src
        mov     edi, dest
        mov     ecx, count
        shr     ecx, 6          // 64 bytes per iteration

loop1:
        PREFETCH (64[ESI])      // Prefetch next loop, non-temporal
        PREFETCH (96[ESI])

        movq    mm1,  0[ESI]    // Read in source data
        movq    mm2,  8[ESI]
        movq    mm3, 16[ESI]
        movq    mm4, 24[ESI]
        movq    mm5, 32[ESI]
        movq    mm6, 40[ESI]
        movq    mm7, 48[ESI]
        movq    mm0, 56[ESI]

        MOVNTQ   0[EDI], mm1    // Non-temporal stores
        MOVNTQ   8[EDI], mm2
        MOVNTQ  16[EDI], mm3
        MOVNTQ  24[EDI], mm4
        MOVNTQ  32[EDI], mm5
        MOVNTQ  40[EDI], mm6
        MOVNTQ  48[EDI], mm7
        MOVNTQ  56[EDI], mm0

        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     loop1
    }
    EMMS_INSTRUCTION
}

/*
================
MMX_Memcpy2kB

240MB/sec
================
*/
#define _alloca16( x ) ((void *)((((int)_alloca( (x)+15 )) + 15) & ~15))
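// Worked example of the rounding: if _alloca() returns 0x0012FF7A, adding 15
// gives 0x0012FF89, and masking with ~15 yields 0x0012FF80, the first
// 16-byte boundary at or above the original pointer.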

static void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
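    // NOTE: callers guarantee count >= 2048 (memcpy_MMX only calls this for
    // copies of 4 kB or more), so the 2 kB block count below is never zero.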
    byte *tbuf = (byte *)_alloca16(2048);
    __asm {
        push    ebx
        mov     esi, src
        mov     ebx, count
        shr     ebx, 11         // 2048 bytes at a time
        mov     edi, dest

loop2k:
        push    edi             // copy 2k into temporary buffer
        mov     edi, tbuf
        mov     ecx, 32

loopMemToL1:
        PREFETCH (64[ESI])      // Prefetch next loop, non-temporal
        PREFETCH (96[ESI])

        movq    mm1,  0[ESI]    // Read in source data
        movq    mm2,  8[ESI]
        movq    mm3, 16[ESI]
        movq    mm4, 24[ESI]
        movq    mm5, 32[ESI]
        movq    mm6, 40[ESI]
        movq    mm7, 48[ESI]
        movq    mm0, 56[ESI]

        movq     0[EDI], mm1    // Store into L1
        movq     8[EDI], mm2
        movq    16[EDI], mm3
        movq    24[EDI], mm4
        movq    32[EDI], mm5
        movq    40[EDI], mm6
        movq    48[EDI], mm7
        movq    56[EDI], mm0
        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     loopMemToL1

        pop     edi             // Now copy from L1 to system memory
        push    esi
        mov     esi, tbuf
        mov     ecx, 32

loopL1ToMem:
        movq    mm1,  0[ESI]    // Read in source data from L1
        movq    mm2,  8[ESI]
        movq    mm3, 16[ESI]
        movq    mm4, 24[ESI]
        movq    mm5, 32[ESI]
        movq    mm6, 40[ESI]
        movq    mm7, 48[ESI]
        movq    mm0, 56[ESI]

        MOVNTQ   0[EDI], mm1    // Non-temporal stores
        MOVNTQ   8[EDI], mm2
        MOVNTQ  16[EDI], mm3
        MOVNTQ  24[EDI], mm4
        MOVNTQ  32[EDI], mm5
        MOVNTQ  40[EDI], mm6
        MOVNTQ  48[EDI], mm7
        MOVNTQ  56[EDI], mm0

        add     esi, 64
        add     edi, 64
        dec     ecx
        jnz     loopL1ToMem

        pop     esi             // Do next 2k block
        dec     ebx
        jnz     loop2k
        pop     ebx
    }
    EMMS_INSTRUCTION
}


/*
================
idSIMD_MMX::Memcpy

optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
static void* memcpy_MMX( void *dest0, const void *src0, size_t count0 ) {
    // if copying more than 16 bytes and we can copy 8 byte aligned
    if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
        byte *dest = (byte *)dest0;
        byte *src = (byte *)src0;

        // copy up to the first 8 byte aligned boundary
        int count = ( 8 - ( ((int)dest) & 7 ) ) & 7;
        memcpy( dest, src, count );
        dest += count;
        src += count;
        count = count0 - count;

        // if there are multiple blocks of 2kB
        if ( count & ~4095 ) {
            MMX_Memcpy2kB( dest, src, count );
            src += (count & ~2047);
            dest += (count & ~2047);
            count &= 2047;
        }

        // if there are blocks of 64 bytes
        if ( count & ~63 ) {
            MMX_Memcpy64B( dest, src, count );
            src += (count & ~63);
            dest += (count & ~63);
            count &= 63;
        }

        // if there are blocks of 8 bytes
        if ( count & ~7 ) {
            MMX_Memcpy8B( dest, src, count );
            src += (count & ~7);
            dest += (count & ~7);
            count &= 7;
        }

        // copy any remaining bytes
        memcpy( dest, src, count );
    } else {
        // use the regular one if we cannot copy 8 byte aligned
        memcpy( dest0, src0, count0 );
    }
    return dest0;
}
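
// Worked example (hypothetical sizes): for an 8-byte-aligned 5003-byte copy
// the tiers above move 4096 bytes in 2 kB blocks, then 896 bytes in 64-byte
// blocks, then 8 bytes in one 8-byte block, and the final memcpy() handles
// the remaining 3 bytes.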

/*
================
idSIMD_MMX::Memset
================
*/
static void* memset_MMX( void* dest0, int val, size_t count0 ) {
    union {
        byte bytes[8];
        unsigned short words[4];
        unsigned int dwords[2];
    } dat;

    byte *dest = (byte *)dest0;
    int count = count0;

    while( count > 0 && (((int)dest) & 7) ) {
        *dest = val;
        dest++;
        count--;
    }
    if ( !count ) {
        return dest0;
    }

    dat.bytes[0] = val;
    dat.bytes[1] = val;
    dat.words[1] = dat.words[0];
    dat.dwords[1] = dat.dwords[0];
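
    // For example, val = 0xAB leaves dat holding the 64-bit pattern
    // 0xABABABABABABABAB, which the loops below store 8 bytes at a time.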

    if ( count >= 64 ) {
        __asm {
            mov     edi, dest
            mov     ecx, count
            shr     ecx, 6      // 64 bytes per iteration
            movq    mm1, dat    // Read in source data
            movq    mm2, mm1
            movq    mm3, mm1
            movq    mm4, mm1
            movq    mm5, mm1
            movq    mm6, mm1
            movq    mm7, mm1
            movq    mm0, mm1
loop1:
            MOVNTQ   0[EDI], mm1 // Non-temporal stores
            MOVNTQ   8[EDI], mm2
            MOVNTQ  16[EDI], mm3
            MOVNTQ  24[EDI], mm4
            MOVNTQ  32[EDI], mm5
            MOVNTQ  40[EDI], mm6
            MOVNTQ  48[EDI], mm7
            MOVNTQ  56[EDI], mm0

            add     edi, 64
            dec     ecx
            jnz     loop1
        }
        dest += ( count & ~63 );
        count &= 63;
    }

    if ( count >= 8 ) {
        __asm {
            mov     edi, dest
            mov     ecx, count
            shr     ecx, 3      // 8 bytes per iteration
            movq    mm1, dat    // Read in source data
loop2:
            MOVNTQ  0[EDI], mm1 // Non-temporal stores

            add     edi, 8
            dec     ecx
            jnz     loop2
        }
        dest += (count & ~7);
        count &= 7;
    }

    while( count > 0 ) {
        *dest = val;
        dest++;
        count--;
    }

    EMMS_INSTRUCTION

    return dest0;
}
#endif // SIMD_INSTRUCTIONS
#endif // _WIN32