// Copyright (C) 2004 Id Software, Inc.
//

//===============================================================
//
//	3DNow! implementation of idSIMDProcessor
//
//===============================================================

#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

#ifdef _WIN32
#ifdef SIMD_INSTRUCTIONS
#define WIN32_LEAN_AND_MEAN
#include <windows.h>

#include <stdio.h>

#include "doomtype.h"
#include "m_argv.h"
#include "SDL_cpuinfo.h"
#include "i_simd.h"

memcpy_fast_f memcpy_fast;
memset_fast_f memset_fast;

static void* memcpy_MMX( void *dst, const void *src, size_t count );
static void* memset_MMX( void *dst, int val, size_t count );
static void* memcpy_3DNow( void *dst, const void *src, size_t count );

void I_InitSIMD(void)
{
  memcpy_fast = memcpy;
  memset_fast = memset;

  if (!M_CheckParm("-nosimd"))
  {
    if (SDL_Has3DNow() && !M_CheckParm("-no3dnow"))
    {
      memcpy_fast = memcpy_3DNow;
      fprintf(stdout, "I_Init: using MMX and 3DNow! for SIMD processing\n");
    }
    else
    {
      if (SDL_HasMMX() && !M_CheckParm("-nommx"))
      {
        memcpy_fast = memcpy_MMX;
        memset_fast = memset_MMX;
        fprintf(stdout, "I_Init: using MMX for SIMD processing\n");
      }
    }
  }
}
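
// Illustrative usage sketch (not compiled; the calling function below is
// hypothetical).  Once I_InitSIMD() has run, callers simply use the function
// pointers in place of the standard library routines.
#if 0
static void R_CopyScreenBuffer(byte *dst, const byte *src, size_t bytes)
{
  memcpy_fast(dst, src, bytes);  // plain memcpy, memcpy_MMX or memcpy_3DNow
  memset_fast(dst, 0, bytes);    // plain memset or memset_MMX
}
#endif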

#define EMMS_INSTRUCTION __asm emms
#if _MSC_VER > 1300
#define PREFETCH(a) prefetchnta a
#define MOVNTQ movntq
#define SFENCE sfence
#else
#define PREFETCH(a)
#define MOVNTQ movq
#define SFENCE
#endif
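
// NOTE: the #else branch above is for compilers whose inline assembler does
// not accept the SSE mnemonics; there PREFETCH() expands to nothing and
// MOVNTQ falls back to a plain (cached) movq, so the copies still work but
// without the prefetch/streaming-store speedup.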
// Very optimized memcpy() routine for all AMD Athlon and Duron family.
// This code uses any of FOUR different basic copy methods, depending
// on the transfer size.
// NOTE:  Since this code uses MOVNTQ (also known as "Non-Temporal MOV" or
// "Streaming Store"), and also uses the software prefetchnta instructions,
// be sure you're running on Athlon/Duron or other recent CPU before calling!

#define TINY_BLOCK_COPY 64       // upper limit for movsd type copy
// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".

#define IN_CACHE_COPY 64 * 1024  // upper limit for movq/movq copy w/SW prefetch
// Next is a copy that uses the MMX registers to copy 8 bytes at a time,
// also using the "unrolled loop" optimization.   This code uses
// the software prefetch instruction to get the data into the cache.

#define UNCACHED_COPY 197 * 1024 // upper limit for movq/movntq w/SW prefetch
// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.   This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
// USE 64 * 1024 FOR THIS VALUE IF YOU'RE ALWAYS FILLING A "CLEAN CACHE"

#define BLOCK_PREFETCH_COPY  infinity // no limit for movq/movntq w/block prefetch
#define CACHEBLOCK 80h // number of 64-byte blocks (cache lines) for block prefetch
// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.   Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch.  The technique is great for
// getting maximum read bandwidth, especially in DDR memory systems.
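
// Illustrative sketch (not compiled): the thresholds above pick a strategy
// roughly as follows.  CACHEBLOCK is 80h = 128 cache lines, i.e. the block
// prefetch loop works the data in 8 kB chunks.
#if 0
	if      ( n <  TINY_BLOCK_COPY ) { /* unrolled movsd/movsb            */ }
	else if ( n <  IN_CACHE_COPY   ) { /* movq loads/stores + sw prefetch */ }
	else if ( n <  UNCACHED_COPY   ) { /* movq loads, movntq stores       */ }
	else                             { /* block prefetch, then movntq     */ }
#endif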

/*
================
idSIMD_3DNow::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
static void* memcpy_3DNow( void *dest, const void *src, size_t n ) {
  __asm {

	mov		ecx, [n]					// number of bytes to copy
	mov		edi, [dest]					// destination
	mov		esi, [src]					// source
	mov		ebx, ecx					// keep a copy of count

	cld
	cmp		ecx, TINY_BLOCK_COPY
	jb		$memcpy_ic_3				// tiny? skip mmx copy

	cmp		ecx, 32*1024				// don't align between 32k-64k because
	jbe		$memcpy_do_align			//  it appears to be slower
	cmp		ecx, 64*1024
	jbe		$memcpy_align_done
$memcpy_do_align:
	mov		ecx, 8						// a trick that's faster than rep movsb...
	sub		ecx, edi					// align destination to qword
	and		ecx, 111b					// get the low bits
	sub		ebx, ecx					// update copy count
	neg		ecx							// set up to jump into the array
	add		ecx, offset $memcpy_align_done
	jmp		ecx							// jump to array of movsb's

align 4
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb
	movsb

$memcpy_align_done:						// destination is dword aligned
	mov		ecx, ebx					// number of bytes left to copy
	shr		ecx, 6						// get 64-byte block count
	jz		$memcpy_ic_2				// finish the last few bytes

	cmp		ecx, IN_CACHE_COPY/64		// too big 4 cache? use uncached copy
	jae		$memcpy_uc_test

// This is small block copy that uses the MMX registers to copy 8 bytes
// at a time.  It uses the "unrolled loop" optimization, and also uses
// the software prefetch instruction to get the data into the cache.
align 16
$memcpy_ic_1:							// 64-byte block copies, in-cache copy

	PREFETCH([esi + (200*64/34+192)])	// start reading ahead

	movq	mm0, [esi+0]				// read 64 bits
	movq	mm1, [esi+8]
	movq	[edi+0], mm0				// write 64 bits
	movq	[edi+8], mm1				//    note:  the normal movq writes the
	movq	mm2, [esi+16]				//    data to cache; a cache line will be
	movq	mm3, [esi+24]				//    allocated as needed, to store the data
	movq	[edi+16], mm2
	movq	[edi+24], mm3
	movq	mm0, [esi+32]
	movq	mm1, [esi+40]
	movq	[edi+32], mm0
	movq	[edi+40], mm1
	movq	mm2, [esi+48]
	movq	mm3, [esi+56]
	movq	[edi+48], mm2
	movq	[edi+56], mm3

	add		esi, 64						// update source pointer
	add		edi, 64						// update destination pointer
	dec		ecx							// count down
	jnz		$memcpy_ic_1				// last 64-byte block?

$memcpy_ic_2:
	mov		ecx, ebx					// has valid low 6 bits of the byte count
$memcpy_ic_3:
	shr		ecx, 2						// dword count
	and		ecx, 1111b					// only look at the "remainder" bits
	neg		ecx							// set up to jump into the array
	add		ecx, offset $memcpy_last_few
	jmp		ecx							// jump to array of movsd's

$memcpy_uc_test:
	cmp		ecx, UNCACHED_COPY/64		// big enough? use block prefetch copy
	jae		$memcpy_bp_1

$memcpy_64_test:
	or		ecx, ecx					// tail end of block prefetch will jump here
	jz		$memcpy_ic_2				// no more 64-byte blocks left

// For larger blocks, which will spill beyond the cache, it's faster to
// use the Streaming Store instruction MOVNTQ.   This write instruction
// bypasses the cache and writes straight to main memory.  This code also
// uses the software prefetch instruction to pre-read the data.
align 16
$memcpy_uc_1:							// 64-byte blocks, uncached copy

	PREFETCH ([esi + (200*64/34+192)])	// start reading ahead

	movq	mm0,[esi+0]					// read 64 bits
	add		edi,64						// update destination pointer
	movq	mm1,[esi+8]
	add		esi,64						// update source pointer
	movq	mm2,[esi-48]
	MOVNTQ	[edi-64], mm0				// write 64 bits, bypassing the cache
	movq	mm0,[esi-40]				//    note: movntq also prevents the CPU
	MOVNTQ	[edi-56], mm1				//    from READING the destination address
	movq	mm1,[esi-32]				//    into the cache, only to be over-written
	MOVNTQ	[edi-48], mm2				//    so that also helps performance
	movq	mm2,[esi-24]
	MOVNTQ	[edi-40], mm0
	movq	mm0,[esi-16]
	MOVNTQ	[edi-32], mm1
	movq	mm1,[esi-8]
	MOVNTQ	[edi-24], mm2
	MOVNTQ	[edi-16], mm0
	dec		ecx
	MOVNTQ	[edi-8], mm1
	jnz		$memcpy_uc_1				// last 64-byte block?

	jmp		$memcpy_ic_2				// almost done

// For the largest size blocks, a special technique called Block Prefetch
// can be used to accelerate the read operations.   Block Prefetch reads
// one address per cache line, for a series of cache lines, in a short loop.
// This is faster than using software prefetch, in this case.
// The technique is great for getting maximum read bandwidth,
// especially in DDR memory systems.
$memcpy_bp_1:							// large blocks, block prefetch copy

	cmp		ecx, CACHEBLOCK				// big enough to run another prefetch loop?
	jl		$memcpy_64_test				// no, back to regular uncached copy

	mov		eax, CACHEBLOCK / 2			// block prefetch loop, unrolled 2X
	add		esi, CACHEBLOCK * 64		// move to the top of the block
align 16
$memcpy_bp_2:
	mov		edx, [esi-64]				// grab one address per cache line
	mov		edx, [esi-128]				// grab one address per cache line
	sub		esi, 128					// go reverse order
	dec		eax							// count down the cache lines
	jnz		$memcpy_bp_2				// keep grabbing more lines into cache

	mov		eax, CACHEBLOCK				// now that it's in cache, do the copy
align 16
$memcpy_bp_3:
	movq	mm0, [esi   ]				// read 64 bits
	movq	mm1, [esi+ 8]
	movq	mm2, [esi+16]
	movq	mm3, [esi+24]
	movq	mm4, [esi+32]
	movq	mm5, [esi+40]
	movq	mm6, [esi+48]
	movq	mm7, [esi+56]
	add		esi, 64						// update source pointer
	MOVNTQ	[edi   ], mm0				// write 64 bits, bypassing cache
	MOVNTQ	[edi+ 8], mm1				//    note: movntq also prevents the CPU
	MOVNTQ	[edi+16], mm2				//    from READING the destination address
	MOVNTQ	[edi+24], mm3				//    into the cache, only to be over-written,
	MOVNTQ	[edi+32], mm4				//    so that also helps performance
	MOVNTQ	[edi+40], mm5
	MOVNTQ	[edi+48], mm6
	MOVNTQ	[edi+56], mm7
	add		edi, 64						// update dest pointer

	dec		eax							// count down

	jnz		$memcpy_bp_3				// keep copying
	sub		ecx, CACHEBLOCK				// update the 64-byte block count
	jmp		$memcpy_bp_1				// keep processing chunks

// The smallest copy uses the X86 "movsd" instruction, in an optimized
// form which is an "unrolled loop".   Then it handles the last few bytes.
align 4
	movsd
	movsd								// perform last 1-15 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd								// perform last 1-7 dword copies
	movsd
	movsd
	movsd
	movsd
	movsd
	movsd

$memcpy_last_few:						// dword aligned from before movsd's
	mov		ecx, ebx					// has valid low 2 bits of the byte count
	and		ecx, 11b					// the last few cows must come home
	jz		$memcpy_final				// no more, let's leave
	rep		movsb						// the last 1, 2, or 3 bytes

$memcpy_final:
	emms								// clean up the MMX state
	SFENCE								// flush the write buffer
	mov		eax, [dest]					// ret value = destination pointer

    }
	return dest;
}

/*
================
MMX_Memcpy8B
================
*/
static void MMX_Memcpy8B( void *dest, const void *src, const int count ) {
	_asm {
        mov		esi, src
        mov		edi, dest
        mov		ecx, count
        shr		ecx, 3			// 8 bytes per iteration

loop1:
        movq	mm1,  0[ESI]	// Read in source data
        MOVNTQ	0[EDI], mm1		// Non-temporal stores

        add		esi, 8
        add		edi, 8
        dec		ecx
        jnz		loop1

	}
	EMMS_INSTRUCTION
}

/*
================
MMX_Memcpy64B

  165MB/sec
================
*/
static void MMX_Memcpy64B( void *dest, const void *src, const int count ) {
	_asm {
        mov		esi, src
        mov		edi, dest
        mov		ecx, count
        shr		ecx, 6		// 64 bytes per iteration

loop1:
        PREFETCH (64[ESI])	// Prefetch next loop, non-temporal
        PREFETCH (96[ESI])

        movq mm1,  0[ESI]	// Read in source data
        movq mm2,  8[ESI]
        movq mm3, 16[ESI]
        movq mm4, 24[ESI]
        movq mm5, 32[ESI]
        movq mm6, 40[ESI]
        movq mm7, 48[ESI]
        movq mm0, 56[ESI]

        MOVNTQ  0[EDI], mm1	// Non-temporal stores
        MOVNTQ  8[EDI], mm2
        MOVNTQ 16[EDI], mm3
        MOVNTQ 24[EDI], mm4
        MOVNTQ 32[EDI], mm5
        MOVNTQ 40[EDI], mm6
        MOVNTQ 48[EDI], mm7
        MOVNTQ 56[EDI], mm0

        add		esi, 64
        add		edi, 64
        dec		ecx
        jnz		loop1
	}
	EMMS_INSTRUCTION
}

/*
================
MMX_Memcpy2kB

  240MB/sec
================
*/
#define _alloca16( x )					((void *)((((int)_alloca( (x)+15 )) + 15) & ~15))
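// _alloca16() over-allocates by 15 bytes and rounds the result up to the next
// multiple of 16, so the temporary buffer below is always 16-byte aligned;
// e.g. an _alloca() result of 0x0012ff31 becomes (0x0012ff31 + 15) & ~15 = 0x0012ff40.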

static void MMX_Memcpy2kB( void *dest, const void *src, const int count ) {
	byte *tbuf = (byte *)_alloca16(2048);
	__asm {
		push	ebx
        mov		esi, src
        mov		ebx, count
        shr		ebx, 11		// 2048 bytes at a time
        mov		edi, dest

loop2k:
        push	edi			// copy 2k into temporary buffer
        mov		edi, tbuf
        mov		ecx, 32

loopMemToL1:
        PREFETCH (64[ESI]) // Prefetch next loop, non-temporal
        PREFETCH (96[ESI])

        movq mm1,  0[ESI]	// Read in source data
        movq mm2,  8[ESI]
        movq mm3, 16[ESI]
        movq mm4, 24[ESI]
        movq mm5, 32[ESI]
        movq mm6, 40[ESI]
        movq mm7, 48[ESI]
        movq mm0, 56[ESI]

        movq  0[EDI], mm1	// Store into L1
        movq  8[EDI], mm2
        movq 16[EDI], mm3
        movq 24[EDI], mm4
        movq 32[EDI], mm5
        movq 40[EDI], mm6
        movq 48[EDI], mm7
        movq 56[EDI], mm0
        add		esi, 64
        add		edi, 64
        dec		ecx
        jnz		loopMemToL1

        pop		edi			// Now copy from L1 to system memory
        push	esi
        mov		esi, tbuf
        mov		ecx, 32

loopL1ToMem:
        movq mm1, 0[ESI]	// Read in source data from L1
        movq mm2, 8[ESI]
        movq mm3, 16[ESI]
        movq mm4, 24[ESI]
        movq mm5, 32[ESI]
        movq mm6, 40[ESI]
        movq mm7, 48[ESI]
        movq mm0, 56[ESI]

        MOVNTQ 0[EDI], mm1	// Non-temporal stores
        MOVNTQ 8[EDI], mm2
        MOVNTQ 16[EDI], mm3
        MOVNTQ 24[EDI], mm4
        MOVNTQ 32[EDI], mm5
        MOVNTQ 40[EDI], mm6
        MOVNTQ 48[EDI], mm7
        MOVNTQ 56[EDI], mm0

        add		esi, 64
        add		edi, 64
        dec		ecx
        jnz		loopL1ToMem

        pop		esi			// Do next 2k block
        dec		ebx
        jnz		loop2k
		pop		ebx
	}
	EMMS_INSTRUCTION
}


/*
================
idSIMD_MMX::Memcpy

  optimized memory copy routine that handles all alignment cases and block sizes efficiently
================
*/
static void* memcpy_MMX( void *dest0, const void *src0, size_t count0 ) {
	// if copying more than 16 bytes and we can copy 8 byte aligned
	if ( count0 > 16 && !( ( (int)dest0 ^ (int)src0 ) & 7 ) ) {
		byte *dest = (byte *)dest0;
		byte *src = (byte *)src0;

		// copy up to the first 8 byte aligned boundary
		int count = ( 8 - ( (int)dest & 7 ) ) & 7;	// bytes needed to reach alignment
		memcpy( dest, src, count );
		dest += count;
		src += count;
		count = count0 - count;

		// if there are multiple blocks of 2kB
		if ( count & ~4095 ) {
			MMX_Memcpy2kB( dest, src, count );
			src += (count & ~2047);
			dest += (count & ~2047);
			count &= 2047;
		}

		// if there are blocks of 64 bytes
		if ( count & ~63 ) {
			MMX_Memcpy64B( dest, src, count );
			src += (count & ~63);
			dest += (count & ~63);
			count &= 63;
		}

		// if there are blocks of 8 bytes
		if ( count & ~7 ) {
			MMX_Memcpy8B( dest, src, count );
			src += (count & ~7);
			dest += (count & ~7);
			count &= 7;
		}

		// copy any remaining bytes
		memcpy( dest, src, count );
	} else {
		// use the regular one if we cannot copy 8 byte aligned
		memcpy( dest0, src0, count0 );
	}
	return dest0;
}
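
// Worked example (hypothetical size): for an 8-byte-aligned copy of 5000
// bytes, count & ~4095 is non-zero, so MMX_Memcpy2kB() moves the first two
// 2 kB blocks (4096 bytes), MMX_Memcpy64B() then moves 896 bytes,
// MMX_Memcpy8B() the next 8, and the trailing memcpy() has nothing left to do.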

/*
================
idSIMD_MMX::Memset
================
*/
static void* memset_MMX( void* dest0, int val, size_t count0 ) {
	union {
		byte	bytes[8];
		unsigned short	words[4];
		unsigned int	dwords[2];
	} dat;

	byte *dest = (byte *)dest0;
	int count = count0;

	while( count > 0 && (((int)dest) & 7) ) {
		*dest = val;
		dest++;
		count--;
	}
	if ( !count ) {
		return dest0;
	}

	dat.bytes[0] = val;
	dat.bytes[1] = val;
	dat.words[1] = dat.words[0];
	dat.dwords[1] = dat.dwords[0];
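	// dat now holds the low byte of val replicated across all 8 bytes, so a
	// single movq below stores 8 copies of the fill value at once.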

	if ( count >= 64 ) {
		__asm {
			mov edi, dest
			mov ecx, count
			shr ecx, 6				// 64 bytes per iteration
			movq mm1, dat			// Read in source data
			movq mm2, mm1
			movq mm3, mm1
			movq mm4, mm1
			movq mm5, mm1
			movq mm6, mm1
			movq mm7, mm1
			movq mm0, mm1
loop1:
			MOVNTQ  0[EDI], mm1		// Non-temporal stores
			MOVNTQ  8[EDI], mm2
			MOVNTQ 16[EDI], mm3
			MOVNTQ 24[EDI], mm4
			MOVNTQ 32[EDI], mm5
			MOVNTQ 40[EDI], mm6
			MOVNTQ 48[EDI], mm7
			MOVNTQ 56[EDI], mm0

			add edi, 64
			dec ecx
			jnz loop1
		}
		dest += ( count & ~63 );
		count &= 63;
	}

	if ( count >= 8 ) {
		__asm {
			mov edi, dest
			mov ecx, count
			shr ecx, 3				// 8 bytes per iteration
			movq mm1, dat			// Read in source data
loop2:
			MOVNTQ  0[EDI], mm1		// Non-temporal stores

			add edi, 8
			dec ecx
			jnz loop2
		}
		dest += (count & ~7);
		count &= 7;
	}

	while( count > 0 ) {
		*dest = val;
		dest++;
		count--;
	}

	EMMS_INSTRUCTION

	return dest0;
}
#endif // SIMD_INSTRUCTIONS
#endif // _WIN32