// Copyright (C) 2004 Id Software, Inc.
//

//===============================================================
//
//	3DNow! implementation of idSIMDProcessor
//
//===============================================================

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef _WIN32
#ifdef SIMD_INSTRUCTIONS
#define WIN32_LEAN_AND_MEAN
#include <windows.h>

#include <stdio.h>

#include "doomtype.h"
#include "m_argv.h"
#include "SDL_cpuinfo.h"
#include "i_simd.h"

memcpy_fast_f memcpy_fast;
memset_fast_f memset_fast;

static void* memcpy_MMX( void *dst, const void *src, size_t count );
static void* memset_MMX( void *dst, int val, size_t count );
static void* memcpy_3DNow( void *dst, const void *src, size_t count );

void I_InitSIMD(void)
{
  memcpy_fast = memcpy;
  memset_fast = memset;

  if (!M_CheckParm("-nosimd"))
  {
    if (SDL_Has3DNow() && !M_CheckParm("-no3dnow"))
    {
      memcpy_fast = memcpy_3DNow;
      fprintf(stdout, "I_Init: using MMX and 3DNow! for SIMD processing\n");
    }
    else
    {
      if (SDL_HasMMX() && !M_CheckParm("-nommx"))
      {
        memcpy_fast = memcpy_MMX;
        memset_fast = memset_MMX;
        fprintf(stdout, "I_Init: using MMX for SIMD processing\n");
      }
    }
  }
}
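
// Illustrative usage sketch (not compiled; the calling function below is
// hypothetical).  Once I_InitSIMD() has run, callers simply use the function
// pointers in place of the standard library routines.
#if 0
static void R_CopyScreenBuffer(byte *dst, const byte *src, size_t bytes)
{
  memcpy_fast(dst, src, bytes);  // plain memcpy, memcpy_MMX or memcpy_3DNow
  memset_fast(dst, 0, bytes);    // plain memset or memset_MMX
}
#endif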

#define EMMS_INSTRUCTION __asm emms
#if _MSC_VER > 1300
#define PREFETCH(a) prefetchnta a
#define MOVNTQ movntq
#define SFENCE sfence
#else
#define PREFETCH(a)
#define MOVNTQ movq
#define SFENCE
#endif
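
// NOTE: the #else branch above is for compilers whose inline assembler does
// not accept the SSE mnemonics; there PREFETCH() expands to nothing and
// MOVNTQ falls back to a plain (cached) movq, so the copies still work but
// without the prefetch/streaming-store speedup.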
// Very optimized memcpy() routine for all AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 64 * 1024  // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization.   This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.   This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.   Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch.  The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
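
// Illustrative sketch (not compiled): the thresholds above pick a strategy
// roughly as follows.  CACHEBLOCK is 80h = 128 cache lines, i.e. the block
// prefetch loop works the data in 8 kB chunks.
#if 0
	if      ( n <  TINY_BLOCK_COPY ) { /* unrolled movsd/movsb            */ }
	else if ( n <  IN_CACHE_COPY   ) { /* movq loads/stores + sw prefetch */ }
	else if ( n <  UNCACHED_COPY   ) { /* movq loads, movntq stores       */ }
	else                             { /* block prefetch, then movntq     */ }
#endif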

/*
================
idSIMD_3DNow::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
static void* memcpy_3DNow( void *dest, const void *src, size_t n ) {
  __asm {

	mov		ecx, [n]					// number of bytes to copy
	mov		edi, [dest]					// destination
	mov		esi, [src]					// source
	mov		ebx, ecx					// keep a copy of count

	cld
	cmp		ecx, TINY_BLOCK_COPY
	jb		$memcpy_ic_3				// tiny? skip mmx copy

	cmp		ecx, 32*1024				// don't align between 32k-64k because
	jbe		$memcpy_do_align			//  it appears to be slower
	cmp		ecx, 64*1024
	jbe		$memcpy_align_done
$memcpy_do_align:
	mov		ecx, 8						// a trick that's faster than rep movsb...
	sub		ecx, edi					// align destination to qword
	and		ecx, 111b					// get the low bits
	sub		ebx, ecx					// update copy count
	neg		ecx							// set up to jump into the array
	add		ecx, offset $memcpy_align_done
	jmp		ecx							// jump to array of movsb's

align 4
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb

$memcpy_align_done:						// destination is dword aligned
	mov		ecx, ebx					// number of bytes left to copy
	shr		ecx, 6						// get 64-byte block count
	jz		$memcpy_ic_2				// finish the last few bytes

	cmp		ecx, IN_CACHE_COPY/64		// too big 4 cache? use uncached copy
	jae		$memcpy_uc_test

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time.  It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1:							// 64-byte block copies, in-cache copy

	PREFETCH([esi + (200*64/34+192)])	// start reading ahead

	movq	mm0, [esi+0]				// read 64 bits
	movq	mm1, [esi+8]
	movq	[edi+0], mm0				// write 64 bits
	movq	[edi+8], mm1				//    note:  the normal movq writes the
	movq	mm2, [esi+16]				//    data to cache; a cache line will be
	movq	mm3, [esi+24]				//    allocated as needed, to store the data
	movq	[edi+16], mm2
	movq	[edi+24], mm3
	movq	mm0, [esi+32]
	movq	mm1, [esi+40]
	movq	[edi+32], mm0
	movq	[edi+40], mm1
	movq	mm2, [esi+48]
	movq	mm3, [esi+56]
	movq	[edi+48], mm2
	movq	[edi+56], mm3

	add		esi, 64						// update source pointer
	add		edi, 64						// update destination pointer
	dec		ecx							// count down
	jnz		$memcpy_ic_1				// last 64-byte block?

$memcpy_ic_2:
	mov		ecx, ebx					// has valid low 6 bits of the byte count
$memcpy_ic_3:
	shr		ecx, 2						// dword count
	and		ecx, 1111b					// only look at the "remainder" bits
	neg		ecx							// set up to jump into the array
	add		ecx, offset $memcpy_last_few
	jmp		ecx							// jump to array of movsd's

$memcpy_uc_test:
	cmp		ecx, UNCACHED_COPY/64		// big enough? use block prefetch copy
	jae		$memcpy_bp_1

$memcpy_64_test:
	or		ecx, ecx					// tail end of block prefetch will jump here
	jz		$memcpy_ic_2				// no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.   This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1:							// 64-byte blocks, uncached copy

	PREFETCH ([esi + (200*64/34+192)])	// start reading ahead

	movq	mm0,[esi+0]					// read 64 bits
	add		edi,64						// update destination pointer
	movq	mm1,[esi+8]
	add		esi,64						// update source pointer
	movq	mm2,[esi-48]
	MOVNTQ	[edi-64], mm0				// write 64 bits, bypassing the cache
	movq	mm0,[esi-40]				//    note: movntq also prevents the CPU
	MOVNTQ	[edi-56], mm1				//    from READING the destination address
	movq	mm1,[esi-32]				//    into the cache, only to be over-written
	MOVNTQ	[edi-48], mm2				//    so that also helps performance
	movq	mm2,[esi-24]
	MOVNTQ	[edi-40], mm0
	movq	mm0,[esi-16]
	MOVNTQ	[edi-32], mm1
	movq	mm1,[esi-8]
	MOVNTQ	[edi-24], mm2
	MOVNTQ	[edi-16], mm0
	dec		ecx
	MOVNTQ	[edi-8], mm1
	jnz		$memcpy_uc_1				// last 64-byte block?

	jmp		$memcpy_ic_2				// almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.   Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
$memcpy_bp_1:							// large blocks, block prefetch copy

	cmp		ecx, CACHEBLOCK				// big enough to run another prefetch loop?
	jl		$memcpy_64_test				// no, back to regular uncached copy

	mov		eax, CACHEBLOCK / 2			// block prefetch loop, unrolled 2X
	add		esi, CACHEBLOCK * 64		// move to the top of the block
align 16
$memcpy_bp_2:
	mov		edx, [esi-64]				// grab one address per cache line
	mov		edx, [esi-128]				// grab one address per cache line
	sub		esi, 128					// go reverse order
	dec		eax							// count down the cache lines
	jnz		$memcpy_bp_2				// keep grabbing more lines into cache

	mov		eax, CACHEBLOCK				// now that it's in cache, do the copy
align 16
$memcpy_bp_3:
	movq	mm0, [esi   ]				// read 64 bits
	movq	mm1, [esi+ 8]
	movq	mm2, [esi+16]
	movq	mm3, [esi+24]
	movq	mm4, [esi+32]
	movq	mm5, [esi+40]
	movq	mm6, [esi+48]
	movq	mm7, [esi+56]
	add		esi, 64						// update source pointer
	MOVNTQ	[edi   ], mm0				// write 64 bits, bypassing cache
	MOVNTQ	[edi+ 8], mm1				//    note: movntq also prevents the CPU
	MOVNTQ	[edi+16], mm2				//    from READING the destination address
	MOVNTQ	[edi+24], mm3				//    into the cache, only to be over-written,
	MOVNTQ	[edi+32], mm4				//    so that also helps performance
	MOVNTQ	[edi+40], mm5
	MOVNTQ	[edi+48], mm6
	MOVNTQ	[edi+56], mm7
	add		edi, 64						// update dest pointer

	dec		eax							// count down

	jnz		$memcpy_bp_3				// keep copying
	sub		ecx, CACHEBLOCK				// update the 64-byte block count
	jmp		$memcpy_bp_1				// keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".   Then it handles the last few bytes.
align 4
	movsd
	movsd								// perform last 1-15 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd								// perform last 1-7 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd

$memcpy_last_few:						// dword aligned from before movsd's
	mov		ecx, ebx					// has valid low 2 bits of the byte count
	and		ecx, 11b					// the last few cows must come home
	jz		$memcpy_final				// no more, let's leave
	rep		movsb						// the last 1, 2, or 3 bytes

$memcpy_final:
	emms								// clean up the MMX state
	SFENCE								// flush the write buffer
	mov		eax, [dest]					// ret value = destination pointer

    }
	return dest;
}

/*
================
MMX_Memcpy8B
================
*/
static void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
	_asm {
        mov		esi, src
        mov		edi, dest
        mov		ecx, count
        shr		ecx, 3			// 8 bytes per iteration

loop1:
        movq	mm1,  0[ESI]	// Read in source data
        MOVNTQ	0[EDI], mm1		// Non-temporal stores

        add		esi, 8
        add		edi, 8
        dec		ecx
        jnz		loop1

	}
	EMMS_INSTRUCTION
}

/*
================
MMX_Memcpy64B

  165MB/sec
================
*/
static void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
	_asm {
        mov		esi, src
        mov		edi, dest
        mov		ecx, count
        shr		ecx, 6		// 64 bytes per iteration

loop1:
        PREFETCH (64[ESI])	// Prefetch next loop, non-temporal
        PREFETCH (96[ESI])

        movq mm1,  0[ESI]	// Read in source data
        movq mm2,  8[ESI]
        movq mm3, 16[ESI]
        movq mm4, 24[ESI]
        movq mm5, 32[ESI]
        movq mm6, 40[ESI]
        movq mm7, 48[ESI]
        movq mm0, 56[ESI]

        MOVNTQ  0[EDI], mm1	// Non-temporal stores
        MOVNTQ  8[EDI], mm2
        MOVNTQ 16[EDI], mm3
        MOVNTQ 24[EDI], mm4
        MOVNTQ 32[EDI], mm5
        MOVNTQ 40[EDI], mm6
        MOVNTQ 48[EDI], mm7
        MOVNTQ 56[EDI], mm0

        add		esi, 64
        add		edi, 64
        dec		ecx
        jnz		loop1
	}
	EMMS_INSTRUCTION
}

/*
================
MMX_Memcpy2kB

  240MB/sec
================
*/
#define _alloca16( x )					((void *)((((int)_alloca( (x)+15 )) + 15) & ~15))
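// _alloca16() over-allocates by 15 bytes and rounds the result up to the next
// multiple of 16, so the temporary buffer below is always 16-byte aligned;
// e.g. an _alloca() result of 0x0012ff31 becomes (0x0012ff31 + 15) & ~15 = 0x0012ff40.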

static void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
	byte *tbuf = (byte *)_alloca16(2048);
	__asm {
		push	ebx
        mov		esi, src
        mov		ebx, count
        shr		ebx, 11		// 2048 bytes at a time
        mov		edi, dest

loop2k:
        push	edi			// copy 2k into temporary buffer
        mov		edi, tbuf
        mov		ecx, 32

loopMemToL1:
        PREFETCH (64[ESI]) // Prefetch next loop, non-temporal
        PREFETCH (96[ESI])

        movq mm1,  0[ESI]	// Read in source data
        movq mm2,  8[ESI]
        movq mm3, 16[ESI]
        movq mm4, 24[ESI]
        movq mm5, 32[ESI]
        movq mm6, 40[ESI]
        movq mm7, 48[ESI]
        movq mm0, 56[ESI]

        movq  0[EDI], mm1	// Store into L1
        movq  8[EDI], mm2
        movq 16[EDI], mm3
        movq 24[EDI], mm4
        movq 32[EDI], mm5
        movq 40[EDI], mm6
        movq 48[EDI], mm7
        movq 56[EDI], mm0
        add		esi, 64
        add		edi, 64
        dec		ecx
        jnz		loopMemToL1

        pop		edi			// Now copy from L1 to system memory
        push	esi
        mov		esi, tbuf
        mov		ecx, 32

loopL1ToMem:
        movq mm1, 0[ESI]	// Read in source data from L1
        movq mm2, 8[ESI]
        movq mm3, 16[ESI]
        movq mm4, 24[ESI]
        movq mm5, 32[ESI]
        movq mm6, 40[ESI]
        movq mm7, 48[ESI]
        movq mm0, 56[ESI]

        MOVNTQ 0[EDI], mm1	// Non-temporal stores
        MOVNTQ 8[EDI], mm2
        MOVNTQ 16[EDI], mm3
        MOVNTQ 24[EDI], mm4
        MOVNTQ 32[EDI], mm5
        MOVNTQ 40[EDI], mm6
        MOVNTQ 48[EDI], mm7
        MOVNTQ 56[EDI], mm0

        add		esi, 64
        add		edi, 64
        dec		ecx
        jnz		loopL1ToMem

        pop		esi			// Do next 2k block
        dec		ebx
        jnz		loop2k
		pop		ebx
	}
	EMMS_INSTRUCTION
}


/*
================
idSIMD_MMX::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
static void* memcpy_MMX( void *dest0, const void *src0, size_t count0 ) {
	// if copying more than 16 bytes and we can copy 8 byte aligned
	if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
		byte *dest = (byte *)dest0;
		byte *src = (byte *)src0;

		// copy up to the first 8 byte aligned boundary
		int count = ( 8 - ( (int)dest & 7 ) ) & 7;	// bytes needed to reach alignment
		memcpy( dest, src, count );
		dest += count;
		src += count;
		count = count0 - count;

		// if there are multiple blocks of 2kB
		if ( count & ~4095 ) {
			MMX_Memcpy2kB( dest, src, count );
			src += (count & ~2047);
			dest += (count & ~2047);
			count &= 2047;
		}

		// if there are blocks of 64 bytes
		if ( count & ~63 ) {
			MMX_Memcpy64B( dest, src, count );
			src += (count & ~63);
			dest += (count & ~63);
			count &= 63;
		}

		// if there are blocks of 8 bytes
		if ( count & ~7 ) {
			MMX_Memcpy8B( dest, src, count );
			src += (count & ~7);
			dest += (count & ~7);
			count &= 7;
		}

		// copy any remaining bytes
		memcpy( dest, src, count );
	} else {
		// use the regular one if we cannot copy 8 byte aligned
		memcpy( dest0, src0, count0 );
	}
	return dest0;
}
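
// Worked example (hypothetical size): for an 8-byte-aligned copy of 5000
// bytes, count & ~4095 is non-zero, so MMX_Memcpy2kB() moves the first two
// 2 kB blocks (4096 bytes), MMX_Memcpy64B() then moves 896 bytes,
// MMX_Memcpy8B() the next 8, and the trailing memcpy() has nothing left to do.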

/*
================
idSIMD_MMX::Memset
================
*/
static void* memset_MMX( void* dest0, int val, size_t count0 ) {
	union {
		byte	bytes[8];
		unsigned short	words[4];
		unsigned int	dwords[2];
	} dat;

	byte *dest = (byte *)dest0;
	int count = count0;

	while( count > 0 && (((int)dest) & 7) ) {
		*dest = val;
		dest++;
		count--;
	}
	if ( !count ) {
		return dest0;
	}

	dat.bytes[0] = val;
	dat.bytes[1] = val;
	dat.words[1] = dat.words[0];
	dat.dwords[1] = dat.dwords[0];
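	// dat now holds the low byte of val replicated across all 8 bytes, so a
	// single movq below stores 8 copies of the fill value at once.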

	if ( count >= 64 ) {
		__asm {
			mov edi, dest
			mov ecx, count
			shr ecx, 6				// 64 bytes per iteration
			movq mm1, dat			// Read in source data
			movq mm2, mm1
			movq mm3, mm1
			movq mm4, mm1
			movq mm5, mm1
			movq mm6, mm1
			movq mm7, mm1
			movq mm0, mm1
loop1:
			MOVNTQ  0[EDI], mm1		// Non-temporal stores
			MOVNTQ  8[EDI], mm2
			MOVNTQ 16[EDI], mm3
			MOVNTQ 24[EDI], mm4
			MOVNTQ 32[EDI], mm5
			MOVNTQ 40[EDI], mm6
			MOVNTQ 48[EDI], mm7
			MOVNTQ 56[EDI], mm0

			add edi, 64
			dec ecx
			jnz loop1
		}
		dest += ( count & ~63 );
		count &= 63;
	}

	if ( count >= 8 ) {
		__asm {
			mov edi, dest
			mov ecx, count
			shr ecx, 3				// 8 bytes per iteration
			movq mm1, dat			// Read in source data
loop2:
			MOVNTQ  0[EDI], mm1		// Non-temporal stores

			add edi, 8
			dec ecx
			jnz loop2
		}
		dest += (count & ~7);
		count &= 7;
	}

	while( count > 0 ) {
		*dest = val;
		dest++;
		count--;
	}

	EMMS_INSTRUCTION

	return dest0;
}
#endif // SIMD_INSTRUCTIONS
#endif // _WIN32