1 /*========================== begin_copyright_notice ============================
2 
3 Copyright (C) 2017-2021 Intel Corporation
4 
5 SPDX-License-Identifier: MIT
6 
7 ============================= end_copyright_notice ===========================*/
8 
9 #pragma once
10 
11 #include "types.h"
12 #include "Debug.h"
13 #include "utility.h"
14 #include <string.h>
15 #include "CpuUtil.h"
16 
17 #if !defined ( _MSC_VER )
18 #include "inc/common/secure_mem.h"
19 #endif
20 
21 #if defined(_WIN32)
22     #include <basetsd.h>
23     #if defined ( _WIN64 ) && defined ( _In_ )
24         // NOTE: <math.h> is not necessary here.
25         // This is only an ugly workaround for a VS2008 bug that causes the compilation
26         // issue on 64-bit DEBUG configuration.
27         // Including "math.h" before "intrin.h" helps to get rid of the following warning:
28         // warning C4985: 'ceil': attributes not present on previous declaration.
29         #include <math.h>
30     #endif
31     #include <intrin.h>
32     #define USE_SSE4_1
33 #else
34     #include <x86intrin.h>
35 #endif
36 
// Fixed-size storage types.  The byte sizes quoted below assume DWORD is
// 4 bytes and WORD is 2 bytes (both come from types.h, not shown here).
typedef __m128              DQWORD;         // 128-bits,   16-bytes
typedef DWORD               PREFETCH[8];    // 256-bits,   32-bytes; sizeof(PREFETCH) is the stride used by PrefetchBuffer
typedef DWORD               CACHELINE[8];   // 256-bits,   32-bytes; NOTE(review): CACHE_LINE_SIZE in iSTD is 64 - confirm which one callers expect
typedef WORD                DHWORD[32];     // 512-bits,   64-bytes
41 
42 namespace iSTD
43 {
44 
// Shift counts, masks and size thresholds shared by the copy routines below.
enum
{
    DWORD_SHIFT         = 2,        // byte count >> 2 gives the number of 4-byte (DWORD) moves
    BYTE_TAIL           = 3,        // presumably a mask for the 0..3 bytes left after DWORD moves - usage not visible here
    INSTR_128_SHIFT     = 4,        // byte count >> 4 gives the number of 16-byte (128-bit) moves
    CACHE_LINE_SHIFT    = 6,        // log2( CACHE_LINE_SIZE )
    DUAL_CACHE_SHIFT    = 7,        // log2( DUAL_CACHE_SIZE ); byte count >> 7 gives the number of 128-byte blocks
    TAIL_SIZE           = 15,       // mask for the low 4 bits: offset within / remainder after a 16-byte unit
    INSTR_WIDTH_128     = 16,       // bytes moved by one 128-bit instruction
    INSTR_WIDTH_256     = 32,       // bytes moved by one 256-bit instruction
    CACHE_LINE_SIZE     = 64,       // MemCopy copies up to this many bytes with 4-byte moves only
    TIERED_TAIL         = 127,      // mask for the remainder after 128-byte blocks
    DUAL_CACHE_SIZE     = 128,      // size of one unrolled copy block (8 x 16 bytes)
    MIN_ERMSB_ALIGNED   = 4096,     // NOTE(review): usage not visible in this part of the file
    MIN_STREAM_SIZE     = 524288,   // 512 KiB; MemCopy uses non-temporal stores above this size
};
61 
// USE_INLINE_ASM is 1 only when compiling with MSVC (_MSC_VER defined) for a
// target that does not define _WIN64; every other configuration gets 0.
#ifdef _WIN64
#   define USE_INLINE_ASM 0
#else
#   if defined _MSC_VER
#       define USE_INLINE_ASM 1
#   else
#       define USE_INLINE_ASM 0
#   endif
#endif
71 
/*****************************************************************************\
Function Prototypes

Forward declarations of the inline helpers provided by this header, so that
they can reference each other regardless of definition order.
\*****************************************************************************/
// prefetch / cache-line flush helpers
inline void Prefetch( const void* );
inline void PrefetchBuffer( const void*, const size_t );
inline void CachelineFlush( const void* );

// memory copy helpers; the template form takes the byte count as a
// compile-time parameter, the overload below it takes it at run time
template <size_t size>
inline void MemCopy( void*, const void* );
inline void MemCopy( void*, const void*, const size_t );
inline void MemCopyWC( void*, const void*, const size_t );
inline void MemCopySwapBytes( void*, const void*, const size_t, const unsigned int);
inline void ScalarSwapBytes( __m128i**, const __m128i**, const size_t, const unsigned int);

inline void SafeMemSet( void*, const int, const size_t );
inline int  SafeMemCompare( const void*, const void*, const size_t );
inline void SafeMemMove( void*, const void*, const size_t );

// the __fastcall block-copy helpers are declared only when _WIN64 is not defined
#ifndef _WIN64
inline void  __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa(void* dst, const void* src );
inline void  __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu(void* dst, const void* src );
#endif
inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, CPU_INSTRUCTION_LEVEL cpuInstructionLevel);

inline void FastCpuBlt( BYTE*, const DWORD, BYTE*, const DWORD, const DWORD, DWORD );

// WORD / DWORD buffer min-max searches, with optional restart-value and copy variants
inline void FindWordBufferMinMax( WORD*, const DWORD, WORD&, WORD& );
inline void FindDWordBufferMinMax( DWORD*, const DWORD, DWORD&, DWORD& );
inline void FindWordBufferMinMaxRestart( WORD*, const DWORD, const WORD, WORD&, WORD&, CPU_INSTRUCTION_LEVEL cpuInstructionLevel );
inline void FindDWordBufferMinMaxRestart( DWORD*, const DWORD, const DWORD, DWORD&, DWORD&, CPU_INSTRUCTION_LEVEL cpuInstructionLevel );

inline void FindWordBufferMinMaxCopy( WORD*, WORD*, const DWORD, WORD&, WORD& );
inline void FindDWordBufferMinMaxCopy( DWORD*, DWORD*, const DWORD, DWORD&, DWORD& );
inline void FindWordBufferMinMaxRestartCopy( WORD*, WORD*, const DWORD, const WORD, WORD&, WORD&, CPU_INSTRUCTION_LEVEL cpuInstructionLevel );
inline void FindDWordBufferMinMaxRestartCopy( DWORD*, DWORD*, const DWORD, const DWORD, DWORD&, DWORD&, CPU_INSTRUCTION_LEVEL cpuInstructionLevel );
107 
/*****************************************************************************\
Inline Function:
    Prefetch

Description:
    Issues a single non-temporal prefetch hint (_MM_HINT_NTA) for the
    address given.

Input:
    ptr - address to prefetch
\*****************************************************************************/
inline void Prefetch( const void* ptr )
{
    const char* const pAddress = static_cast<const char*>( ptr );
    _mm_prefetch( pAddress, _MM_HINT_NTA );
}
119 
120 /*****************************************************************************\
121 Inline Function:
122     PrefetchBuffer
123 
124 Description:
125     executes __asm prefetchnta
126 \*****************************************************************************/
PrefetchBuffer(const void * pBuffer,const size_t bytes)127 inline void PrefetchBuffer( const void* pBuffer, const size_t bytes )
128 {
129     const size_t cachelines = bytes / sizeof(PREFETCH);
130 
131     for( size_t i = 0; i <= cachelines; i++ )
132     {
133         _mm_prefetch( (const char*)pBuffer + i * sizeof(PREFETCH),
134             _MM_HINT_NTA );
135     }
136 }
137 
/*****************************************************************************\
Inline Function:
    CachelineFlush

Description:
    Executes clflush (via _mm_clflush) on the cache line containing ptr.

Input:
    ptr - any address inside the cache line to flush
\*****************************************************************************/
inline void CachelineFlush( const void* ptr )
{
    // _mm_clflush accepts a pointer to const, so no cast is needed
    _mm_clflush( ptr );
}
149 
/*****************************************************************************\
Inline Function:
    MemCopy

Description:
    Memory copy whose byte count is a compile-time template argument.

    This primary template is the generic fallback: it forwards to the
    run-time sized MemCopy( dst, src, bytes ) with bytes = size.  Explicit
    specializations for selected sizes follow it in this header.

Input:
    dst - destination buffer, at least "size" bytes
    src - source buffer, at least "size" bytes
\*****************************************************************************/
template <size_t size>
inline void MemCopy( void* dst, const void* src )
{
    MemCopy(dst, src, size);
}
162 
163 template <>
164 inline void MemCopy<1>( void* dst, const void* src )
165 {
166     const BYTE* pSrc = reinterpret_cast<const BYTE*>(src);
167     BYTE*       pDst = reinterpret_cast<BYTE*>(dst);
168     *pDst = *pSrc;
169 }
170 
171 template <>
172 inline void MemCopy<2>( void* dst, const void* src )
173 {
174     const WORD* pSrc = reinterpret_cast<const WORD*>(src);
175     WORD*       pDst = reinterpret_cast<WORD*>(dst);
176     *pDst = *pSrc;
177 }
178 
179 template <>
180 inline void MemCopy<4>( void* dst, const void* src )
181 {
182     const UINT32*   pSrc = reinterpret_cast<const UINT32*>(src);
183     UINT32*         pDst = reinterpret_cast<UINT32*>(dst);
184     *pDst = *pSrc;
185 }
186 
187 template <>
188 inline void MemCopy<8>( void* dst, const void* src )
189 {
190     const UINT64*   pSrc = reinterpret_cast<const UINT64*>(src);
191     UINT64*         pDst = reinterpret_cast<UINT64*>(dst);
192     *pDst = *pSrc;
193 }
194 
195 template <>
196 inline void MemCopy<16>( void* dst, const void* src )
197 {
198     const __m128i*  pMMSrc  = reinterpret_cast<const __m128i*>(src);
199     __m128i*        pMMDst  = reinterpret_cast<__m128i*>(dst);
200     __m128i         xmm0    = _mm_loadu_si128(pMMSrc);
201     _mm_storeu_si128(pMMDst, xmm0);
202 }
203 
204 template <>
205 inline void MemCopy<28>( void* dst, const void* src )
206 {
207     const __m128i*  pMMSrc  = reinterpret_cast<const __m128i*>( src );
208     __m128i*        pMMDst  = reinterpret_cast<__m128i*>( dst );
209     __m128i         xmm0    = _mm_loadu_si128( pMMSrc );
210     _mm_storeu_si128( pMMDst, xmm0 );
211 
212     pMMSrc += 1;
213     pMMDst += 1;
214 
215     const UINT64*   pSrc64 = reinterpret_cast<const UINT64*>( pMMSrc );
216     UINT64*         pDst64 = reinterpret_cast<UINT64*>( pMMDst );
217     *pDst64 = *pSrc64;
218 
219     pDst64 += 1;
220     pSrc64 += 1;
221 
222     const UINT32*   pSrc32 = reinterpret_cast<const UINT32*>( pSrc64 );
223     UINT32*         pDst32 = reinterpret_cast<UINT32*>( pDst64 );
224     *pDst32 = *pSrc32;
225 }
226 
227 /*****************************************************************************\
228 Inline Function:
229     MemCopy
230 
231 Description:
232     Exception Handler Memory Copy function
233 \*****************************************************************************/
MemCopy(void * dst,const void * src,const size_t bytes)234 inline void MemCopy( void* dst, const void* src, const size_t bytes )
235 {
236 #if defined ( _MSC_VER )
237     UINT8*            pDst8 = reinterpret_cast<UINT8*>( dst );
238     const UINT8*    pSrc8 = reinterpret_cast<const UINT8*>( src );
239     size_t            bytesRemaining = bytes;
240 
241     // handle invalid cases
242     if( bytesRemaining == 0 )
243         return;
244 
245     // handle sizes <= 4 bytes
246     if( bytesRemaining <= 4 )
247     {
248         if( bytesRemaining == 1 )
249         {
250             // copy 1 bytes
251             *pDst8 = *pSrc8;
252             return;
253         }
254 
255         if( bytesRemaining == 2 )
256         {
257             // copy 2 bytes
258             *reinterpret_cast<UINT16*>( pDst8 ) = *reinterpret_cast<const UINT16*>( pSrc8 );
259             return;
260         }
261 
262         if( bytesRemaining == 3 )
263         {
264             // copy 3 bytes
265             *reinterpret_cast<UINT16*>( pDst8 ) = *reinterpret_cast<const UINT16*>( pSrc8 );
266             *( pDst8 + 2 ) = *( pSrc8 + 2 );
267             return;
268         }
269 
270         *reinterpret_cast<UINT32*>( pDst8 ) = *reinterpret_cast<const UINT32*>( pSrc8 );
271         return;
272     }
273 
274     // align destination to 4 byte boundary if size is > 8 bytes
275     if( bytesRemaining > 8 &&
276         reinterpret_cast<UINT_PTR>( pDst8 ) & 0x3 )
277     {
278         // check for shift by 1
279         if( reinterpret_cast<UINT_PTR>( pDst8 ) & 0x1 )
280         {
281             *pDst8 = *pSrc8;
282 
283             bytesRemaining -= 1;
284             pDst8 += 1;
285             pSrc8 += 1;
286         }
287 
288         // check for shift by 2
289         if( reinterpret_cast<UINT_PTR>( pDst8 ) & 0x2 )
290         {
291             *reinterpret_cast<UINT16*>( pDst8 ) = *reinterpret_cast<const UINT16*>( pSrc8 );
292 
293             bytesRemaining -= 2;
294             pDst8 += 2;
295             pSrc8 += 2;
296         }
297     }
298 
299     // handle sizes <= 64 bytes as series of 4 byte moves
300     if( bytesRemaining <= CACHE_LINE_SIZE )
301     {
302         const size_t ptrAdvance = bytesRemaining & ~0x3; // TODO: Need to see if we can mimic the jump table
303 
304         pDst8 += ptrAdvance;
305         pSrc8 += ptrAdvance;
306 
307         switch( bytesRemaining / 4 )
308         {
309             case 16:
310                 *reinterpret_cast<UINT32*>( pDst8 - 64 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 64 );
311             case 15:
312                 *reinterpret_cast<UINT32*>( pDst8 - 60 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 60 );
313             case 14:
314                 *reinterpret_cast<UINT32*>( pDst8 - 56 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 56 );
315             case 13:
316                 *reinterpret_cast<UINT32*>( pDst8 - 52 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 52 );
317             case 12:
318                 *reinterpret_cast<UINT32*>( pDst8 - 48 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 48 );
319             case 11:
320                 *reinterpret_cast<UINT32*>( pDst8 - 44 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 44 );
321             case 10:
322                 *reinterpret_cast<UINT32*>( pDst8 - 40 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 40 );
323             case 9:
324                 *reinterpret_cast<UINT32*>( pDst8 - 36 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 36 );
325             case 8:
326                 *reinterpret_cast<UINT32*>( pDst8 - 32 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 32 );
327             case 7:
328                 *reinterpret_cast<UINT32*>( pDst8 - 28 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 28 );
329             case 6:
330                 *reinterpret_cast<UINT32*>( pDst8 - 24 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 24 );
331             case 5:
332                 *reinterpret_cast<UINT32*>( pDst8 - 20 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 20 );
333             case 4:
334                 *reinterpret_cast<UINT32*>( pDst8 - 16 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 16 );
335             case 3:
336                 *reinterpret_cast<UINT32*>( pDst8 - 12 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 12 );
337             case 2:
338                 *reinterpret_cast<UINT32*>( pDst8 - 8 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 8 );
339             case 1:
340                 *reinterpret_cast<UINT32*>( pDst8 - 4 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 4 );
341         }
342 
343         // tail may have up to 3 bytes off
344         if( bytesRemaining & 0x1 )
345         {
346             *pDst8 = *pSrc8;
347 
348             bytesRemaining -= 1;
349             pDst8 += 1;
350             pSrc8 += 1;
351         }
352 
353         if( bytesRemaining & 0x2 )
354         {
355             *reinterpret_cast<UINT16*>( pDst8 ) = *reinterpret_cast<const UINT16*>( pSrc8 );
356 
357             bytesRemaining -= 2;
358             pDst8 += 2;
359             pSrc8 += 2;
360         }
361     }
362 
363     // size is > 64 bytes use SSE2
364     else
365     {
366         __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // xmm registers
367 
368         // align the destination to 16 bytes if necessary
369         const size_t alignDst16 = reinterpret_cast<UINT_PTR>( pDst8 ) & TAIL_SIZE;
370         if( alignDst16 != 0 )
371         {
372             const size_t alignSize = 0x10 - alignDst16;
373 
374             // already aligned to 4 bytes previously, so remainder must be a multiple of 4
375             pDst8 += alignSize;
376             pSrc8 += alignSize;
377 
378             switch( alignSize / 4 )
379             {
380                 case 3:
381                     *reinterpret_cast<UINT32*>( pDst8 - 12 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 12 );
382                 case 2:
383                     *reinterpret_cast<UINT32*>( pDst8 - 8 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 8 );
384                 case 1:
385                     *reinterpret_cast<UINT32*>( pDst8 - 4 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 4 );
386             }
387 
388             bytesRemaining -= alignSize;
389         }
390 
391         // if the size is greater than 1/2 largest cache
392         if( bytesRemaining > MIN_STREAM_SIZE )
393         {
394             while( bytesRemaining >= 128 )
395             {
396                 pDst8 += 128;
397                 pSrc8 += 128;
398                 bytesRemaining -= 128;
399 
400                 xmm0 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 128 ));
401                 xmm1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
402                 xmm2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
403                 xmm3 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
404                 xmm4 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
405                 xmm5 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
406                 xmm6 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
407                 xmm7 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));
408 
409                 _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 128 ), xmm0 );
410                 _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm1 );
411                 _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm2 );
412                 _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm3 );
413                 _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm4 );
414                 _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm5 );
415                 _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm6 );
416                 _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm7);
417             }
418 
419             // copy up to 128 bytes
420             const size_t ptrAdvance = bytesRemaining & ~0xF;
421 
422             pDst8 += ptrAdvance;
423             pSrc8 += ptrAdvance;
424 
425             switch( bytesRemaining / 16 )
426             {
427                 case 7:
428                     xmm0 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
429                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm0 );
430                 case 6:
431                     xmm1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
432                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm1 );
433                 case 5:
434                     xmm2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
435                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm2 );
436                 case 4:
437                     xmm3 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
438                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm3 );
439                 case 3:
440                     xmm4 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
441                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm4 );
442                 case 2:
443                     xmm5 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
444                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm5 );
445                 case 1:
446                     xmm6 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));
447                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm6 );
448             }
449 
450             bytesRemaining -= ptrAdvance;
451         }
452 
453         // size is less than 1/2 the largest cache, copy either fully aligned or partially aligned
454         else
455         {
456             const size_t alignSrc16 = reinterpret_cast<UINT_PTR>( pSrc8 ) & 0xF;
457 
458             // copy with source un-aligned
459             if( alignSrc16 != 0 )
460             {
461                 while( bytesRemaining >= 128 )
462                 {
463                     pDst8 += 128;
464                     pSrc8 += 128;
465                     bytesRemaining -= 128;
466 
467                     xmm0 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 128 ));
468                     xmm1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
469                     xmm2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
470                     xmm3 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
471                     xmm4 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
472                     xmm5 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
473                     xmm6 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
474                     xmm7 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));
475 
476                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 128 ), xmm0 );
477                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm1 );
478                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm2 );
479                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm3 );
480                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm4 );
481                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm5 );
482                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm6 );
483                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm7 );
484                 }
485 
486                 // copy up to 128 bytes
487                 const size_t ptrAdvance = bytesRemaining & ~0xF;
488 
489                 pDst8 += ptrAdvance;
490                 pSrc8 += ptrAdvance;
491 
492                 switch( bytesRemaining / 16 )
493                 {
494                     case 7:
495                         xmm0 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
496                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm0 );
497                     case 6:
498                         xmm1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
499                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm1 );
500                     case 5:
501                         xmm2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
502                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm2 );
503                     case 4:
504                         xmm3 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
505                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm3 );
506                     case 3:
507                         xmm4 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
508                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm4 );
509                     case 2:
510                         xmm5 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
511                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm5 );
512                     case 1:
513                         xmm6 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));
514                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm6 );
515                 }
516 
517                 bytesRemaining -= ptrAdvance;
518             }
519 
520             // copy with source aligned
521             else
522             {
523                 while( bytesRemaining >= 128 )
524                 {
525                     pDst8 += 128;
526                     pSrc8 += 128;
527                     bytesRemaining -= 128;
528 
529                     xmm0 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 128 ));
530                     xmm1 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
531                     xmm2 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
532                     xmm3 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
533                     xmm4 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
534                     xmm5 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
535                     xmm6 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
536                     xmm7 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));
537 
538                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 128 ), xmm0 );
539                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm1 );
540                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm2 );
541                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm3 );
542                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm4 );
543                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm5 );
544                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm6 );
545                     _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm7 );
546                 }
547 
548                 // copy up to 128 bytes
549                 const size_t ptrAdvance = bytesRemaining & ~0xF;
550 
551                 pDst8 += ptrAdvance;
552                 pSrc8 += ptrAdvance;
553 
554                 switch( bytesRemaining / 16 )
555                 {
556                     case 7:
557                         xmm0 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
558                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm0 );
559                     case 6:
560                         xmm1 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
561                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm1 );
562                     case 5:
563                         xmm2 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
564                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm2 );
565                     case 4:
566                         xmm3 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
567                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm3 );
568                     case 3:
569                         xmm4 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
570                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm4 );
571                     case 2:
572                         xmm5 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
573                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm5 );
574                     case 1:
575                         xmm6 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));
576                         _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm6 );
577                 }
578 
579                 bytesRemaining -= ptrAdvance;
580             }
581         }
582 
583         // copy the tail up to 15 bytes
584         if( bytesRemaining )
585         {
586             const size_t ptrAdvance = bytesRemaining & ~0x3;
587 
588             pDst8 += ptrAdvance;
589             pSrc8 += ptrAdvance;
590 
591             // copy last up to 12 bytes
592             switch( bytesRemaining / 4 )
593             {
594                 case 3:
595                     *reinterpret_cast<UINT32*>( pDst8 - 12 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 12 );
596                 case 2:
597                     *reinterpret_cast<UINT32*>( pDst8 - 8 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 8 );
598                 case 1:
599                     *reinterpret_cast<UINT32*>( pDst8 - 4 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 4 );
600             }
601 
602             // copy last up to 3 bytes
603             if( bytesRemaining & 0x1 )
604             {
605                 *pDst8 = *pSrc8;
606 
607                 bytesRemaining -= 1;
608                 pDst8 += 1;
609                 pSrc8 += 1;
610             }
611 
612             if( bytesRemaining & 0x2 )
613             {
614                 *reinterpret_cast<UINT16*>( pDst8 ) = *reinterpret_cast<const UINT16*>( pSrc8 );
615 
616                 bytesRemaining -= 2;
617                 pDst8 += 2;
618                 pSrc8 += 2;
619             }
620         }
621     }
622 #else // #if defined ( _MSC_VER )
623     // Linux projects do not support standard types or memcpy_s
624     ::memcpy_s(dst, bytes, src, bytes);
625 #endif
626 }
627 
628 /*****************************************************************************\
629 Inline Function:
630     MemCopyWC
631 
632 Description:
633     Memory copy to a destination that is un-cacheable, i.e host to gpu.
634 
635 Input:
636     dst - pointer to write-combined destination buffer
637     src - pointer to source buffer
638     bytes - number of bytes to copy
639 \*****************************************************************************/
MemCopyWC(void * dst,const void * src,const size_t bytes)640 inline void MemCopyWC( void* dst, const void* src, const size_t bytes )
641 {
642 #if defined ( _MSC_VER )
643     const __m128i           s_SSE2CmpMask   = _mm_setr_epi8( 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 );
644     const __m128i*          pMMSrc          = reinterpret_cast<const __m128i*>(src);
645     __m128i*                pMMDest         = reinterpret_cast<__m128i*>(dst);
646     size_t                  count           = bytes;
647     size_t                  cnt             = 0;
648     __m128i                 xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
649 
650     // if size > 16 align destination and move non-temporally
651     if (count >= INSTR_WIDTH_128)
652     {
653         // align destination to 16 if necessary
654         UINT32 align = (UINT32)((UINT_PTR)pMMDest & TAIL_SIZE);
655         if (align != 0)
656         {
657             // move alignment through a masked non-temporal move
658             const char* pSrc    = reinterpret_cast<const char*>(pMMSrc);
659             char*       pDst    = reinterpret_cast<char*>(pMMDest);
660 
661             align               = INSTR_WIDTH_128 - align;
662             char    shiftCnt    = (char)(INSTR_WIDTH_128 - align - 1);
663             __m128i shiftMask   = _mm_set1_epi8(shiftCnt);
664             __m128i mask        = _mm_cmpgt_epi8(s_SSE2CmpMask, shiftMask);
665             __m128i val         = _mm_loadu_si128(pMMSrc);
666             _mm_maskmoveu_si128(val, mask, pDst);
667 
668             pSrc += align;
669             pDst += align;
670 
671             pMMSrc  = reinterpret_cast<const __m128i*>(pSrc);
672             pMMDest = reinterpret_cast<__m128i*>(pDst);
673         }
674 
675         count -= align; // take off the alignment from size
676 
677         // check source alignment
678         if ((UINT_PTR)pMMSrc & TAIL_SIZE)
679         {
680             // copy un-aligned by tiers
681             cnt = count >> DUAL_CACHE_SHIFT;
682             for (UINT32 i = 0; i < cnt; i += 1)
683             {
684                 xmm0 = _mm_loadu_si128(pMMSrc);
685                 xmm1 = _mm_loadu_si128(pMMSrc + 1);
686                 xmm2 = _mm_loadu_si128(pMMSrc + 2);
687                 xmm3 = _mm_loadu_si128(pMMSrc + 3);
688                 xmm4 = _mm_loadu_si128(pMMSrc + 4);
689                 xmm5 = _mm_loadu_si128(pMMSrc + 5);
690                 xmm6 = _mm_loadu_si128(pMMSrc + 6);
691                 xmm7 = _mm_loadu_si128(pMMSrc + 7);
692                 pMMSrc += 8;
693 
694                 _mm_stream_si128(pMMDest, xmm0);
695                 _mm_stream_si128(pMMDest + 1, xmm1);
696                 _mm_stream_si128(pMMDest + 2, xmm2);
697                 _mm_stream_si128(pMMDest + 3, xmm3);
698                 _mm_stream_si128(pMMDest + 4, xmm4);
699                 _mm_stream_si128(pMMDest + 5, xmm5);
700                 _mm_stream_si128(pMMDest + 6, xmm6);
701                 _mm_stream_si128(pMMDest + 7, xmm7);
702                 pMMDest += 8;
703             }
704 
705             count &= TIERED_TAIL;
706             if (count != 0)
707             {
708                 cnt = count >> INSTR_128_SHIFT;
709                 for (UINT32 i = 0; i < cnt; i += 1)
710                 {
711                     xmm0 = _mm_loadu_si128(pMMSrc);
712                     pMMSrc += 1;
713                     _mm_stream_si128(pMMDest, xmm0);
714                     pMMDest += 1;
715                 }
716             }
717         }
718         else
719         {
720             // copy aligned by tiers
721             cnt = count >> DUAL_CACHE_SHIFT;
722             for (UINT32 i = 0; i < cnt; i += 1)
723             {
724                 xmm0 = _mm_load_si128(pMMSrc);
725                 xmm1 = _mm_load_si128(pMMSrc + 1);
726                 xmm2 = _mm_load_si128(pMMSrc + 2);
727                 xmm3 = _mm_load_si128(pMMSrc + 3);
728                 xmm4 = _mm_load_si128(pMMSrc + 4);
729                 xmm5 = _mm_load_si128(pMMSrc + 5);
730                 xmm6 = _mm_load_si128(pMMSrc + 6);
731                 xmm7 = _mm_load_si128(pMMSrc + 7);
732                 pMMSrc += 8;
733 
734                 _mm_stream_si128(pMMDest, xmm0);
735                 _mm_stream_si128(pMMDest + 1, xmm1);
736                 _mm_stream_si128(pMMDest + 2, xmm2);
737                 _mm_stream_si128(pMMDest + 3, xmm3);
738                 _mm_stream_si128(pMMDest + 4, xmm4);
739                 _mm_stream_si128(pMMDest + 5, xmm5);
740                 _mm_stream_si128(pMMDest + 6, xmm6);
741                 _mm_stream_si128(pMMDest + 7, xmm7);
742                 pMMDest += 8;
743             }
744 
745             count &= TIERED_TAIL;
746             if (count != 0)
747             {
748                 cnt = count >> INSTR_128_SHIFT;
749                 for (UINT32 i = 0; i < cnt; i += 1)
750                 {
751                     xmm0 = _mm_load_si128(pMMSrc);
752                     pMMSrc += 1;
753                     _mm_stream_si128(pMMDest, xmm0);
754                     pMMDest += 1;
755                 }
756             }
757         }
758     }
759 
760     // handle tail copy as a fallthrough
761     count &= TAIL_SIZE;
762     if (count != 0)
763     {
764         cnt                 = count >> DWORD_SHIFT;
765         DWORD*          pDst = reinterpret_cast<DWORD*>(pMMDest);
766         const DWORD*    pSrc = reinterpret_cast<const DWORD*>(pMMSrc);
767 
768         for (UINT32 i = 0; i < cnt; i += 1)
769         {
770             *pDst    = *pSrc;
771             pDst     += 1;
772             pSrc     += 1;
773         }
774 
775         cnt                 = count & BYTE_TAIL;
776         BYTE*       bDst    = reinterpret_cast<BYTE*>(pDst);
777         const BYTE* bSrc    = reinterpret_cast<const BYTE*>(pSrc);
778 
779         for (UINT32 i = 0; i < cnt; i += 1)
780         {
781             *bDst   = *bSrc;
782             bDst    += 1;
783             bSrc    += 1;
784         }
785     }
786 #else // #if defined ( _MSC_VER )
787     // Linux projects do not support standard types or memcpy_s
788     ::memcpy_s(dst, bytes, src, bytes);
789 #endif
790 }
791 
792 /*****************************************************************************\
793 Inline Function:
794     ScalarSwapBytes
795 
796 Description:
797     Helper function for MemCopySwapBytes
798 \*****************************************************************************/
ScalarSwapBytes(__m128i ** dst,const __m128i ** src,const size_t byteCount,const unsigned int swapbytes)799 inline void ScalarSwapBytes(
800     __m128i** dst,
801     const __m128i** src,
802     const size_t byteCount,
803     const unsigned int swapbytes)
804 {
805     switch (swapbytes)
806     {
807     case 2:
808         {
809             WORD*          wDst = reinterpret_cast<WORD*>(*dst);
810             const WORD*    wSrc = reinterpret_cast<const WORD*>(*src);
811 
812             for (UINT32 i = 0; i < byteCount / 2; i += 1)
813             {
814                 WORD tmp = *wSrc;
815                 *wDst    = (tmp >> 8) | (tmp << 8);
816                 wDst     += 1;
817                 wSrc     += 1;
818             }
819 
820             *src  = reinterpret_cast<const __m128i*>(wSrc);
821             *dst = reinterpret_cast<__m128i*>(wDst);
822         }
823         break;
824     case 4:
825         {
826             DWORD*          dwDst = reinterpret_cast<DWORD*>(*dst);
827             const DWORD*    dwSrc = reinterpret_cast<const DWORD*>(*src);
828 
829             for (UINT32 i = 0; i < byteCount / 4; i += 1)
830             {
831                 DWORD tmp = *dwSrc;
832                 *dwDst    = (tmp >> 24) | (tmp << 24) |
833                             ((tmp & 0x0000FF00) << 8) |
834                             ((tmp & 0x00FF0000) >> 8);
835                 dwDst     += 1;
836                 dwSrc     += 1;
837             }
838 
839             *src  = reinterpret_cast<const __m128i*>(dwSrc);
840             *dst = reinterpret_cast<__m128i*>(dwDst);
841         }
842         break;
843     default:
844         // should not occur
845         BYTE*               bDst = reinterpret_cast<BYTE*>(*dst);
846         const BYTE*         bSrc = reinterpret_cast<const BYTE*>(*src);
847 
848         ::memcpy_s(bDst, byteCount, bSrc, byteCount);
849 
850         *src = reinterpret_cast<const __m128i*>(bSrc + byteCount);
851         *dst = reinterpret_cast<__m128i*>(bDst + byteCount);
852     }
853 }
854 
855 /*****************************************************************************\
856 Inline Function:
857     MemCopySwapBytes
858 
859 Description:
860     Memory copy with swapped byte order, 2 and 4 byte elements only
861 
862 Input:
863     dst - pointer to write-combined destination buffer
864     src - pointer to source buffer
865     bytes - number of bytes to copy
866     swapbytes - granularity of elements to swap
867 \*****************************************************************************/
MemCopySwapBytes(void * dst,const void * src,const size_t bytes,const unsigned int swapbytes)868 inline void MemCopySwapBytes(
869     void* dst,
870     const void* src,
871     const size_t bytes,
872     const unsigned int swapbytes)
873 {
874     const __m128i*      pMMSrc  = reinterpret_cast<const __m128i*>(src);
875     __m128i*            pMMDest = reinterpret_cast<__m128i*>(dst);
876     size_t              count   = bytes;
877     size_t              cnt     = 0;
878     __m128i             xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
879 
880     // 2 byte shuffle
881     const __m128i       wordMask = _mm_setr_epi8(
882                             0x01, 0x00, 0x03, 0x02, 0x05, 0x04, 0x07, 0x06,
883                             0x09, 0x08, 0x0b, 0x0a, 0x0d, 0x0c, 0x0f, 0x0e);
884 
885     // 4 byte shuffle
886     const __m128i       dwordMask = _mm_setr_epi8(
887                             0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04,
888                             0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c);
889 
890     // SSE3 support required
891     CPU_INSTRUCTION_LEVEL cpuInstructionLevel = GetCpuInstructionLevel();
892     if (cpuInstructionLevel < CPU_INSTRUCTION_LEVEL_SSE3)
893     {
894         ScalarSwapBytes(&pMMDest, &pMMSrc, count, swapbytes);
895         return;
896     }
897 
898     // only handle 2 and 4 bytes swapping
899     if (swapbytes != 2 && swapbytes != 4)
900     {
901         MemCopy(pMMDest, pMMSrc, count);
902         return;
903     }
904 
905     // when size is < 16 rely, must use scalar swap
906     if (count < INSTR_WIDTH_128)
907     {
908         ScalarSwapBytes(&pMMDest, &pMMSrc, count, swapbytes);
909     }
910     else
911     {
912         const __m128i shuffleMask = (swapbytes == 2) ? wordMask : dwordMask;
913 
914          // handle un-aligned tiered copy up to 2 cache lines
915         if (count < 2 * CACHE_LINE_SIZE)
916         {
917             cnt = count >> INSTR_128_SHIFT;
918             for (UINT32 i = 0; i < cnt; i += 1)
919             {
920                 xmm0 = _mm_loadu_si128(pMMSrc);
921                 pMMSrc += 1;
922                 xmm0 = _mm_shuffle_epi8(xmm0, shuffleMask);
923                 _mm_storeu_si128(pMMDest, xmm0);
924                 pMMDest += 1;
925             }
926         }
927         // handle aligned copy for > 2 cache lines
928         else
929         {
930             // align destination to 16 if necessary
931             UINT32 align = (UINT32)((UINT_PTR)pMMDest & TAIL_SIZE);
932             if (align != 0)
933             {
934                 align = INSTR_WIDTH_128 - align;
935                 cnt = align >> DWORD_SHIFT;
936                 ScalarSwapBytes(&pMMDest, &pMMSrc, cnt * sizeof(DWORD), swapbytes);
937                 cnt = align & BYTE_TAIL;
938 
939                 // only words should remain, not bytes
940                 if (cnt > 0)
941                 {
942                     ASSERT(cnt % 2 == 0);
943                     ASSERT(swapbytes == 2);
944                     ScalarSwapBytes(&pMMDest, &pMMSrc, cnt, swapbytes);
945                 }
946             }
947 
948             count -= align; // take off the alignment from size
949 
950             // check source alignment
951             if ((UINT_PTR)pMMSrc & TAIL_SIZE)
952             {
953                 // copy un-aligned by tiers
954                 cnt = count >> DUAL_CACHE_SHIFT;
955                 for (UINT32 i = 0; i < cnt; i += 1)
956                 {
957                     xmm0 = _mm_loadu_si128(pMMSrc);
958                     xmm1 = _mm_loadu_si128(pMMSrc + 1);
959                     xmm2 = _mm_loadu_si128(pMMSrc + 2);
960                     xmm3 = _mm_loadu_si128(pMMSrc + 3);
961                     xmm4 = _mm_loadu_si128(pMMSrc + 4);
962                     xmm5 = _mm_loadu_si128(pMMSrc + 5);
963                     xmm6 = _mm_loadu_si128(pMMSrc + 6);
964                     xmm7 = _mm_loadu_si128(pMMSrc + 7);
965                     pMMSrc += 8;
966 
967                     xmm0 = _mm_shuffle_epi8(xmm0, shuffleMask);
968                     xmm1 = _mm_shuffle_epi8(xmm1, shuffleMask);
969                     xmm2 = _mm_shuffle_epi8(xmm2, shuffleMask);
970                     xmm3 = _mm_shuffle_epi8(xmm3, shuffleMask);
971                     xmm4 = _mm_shuffle_epi8(xmm4, shuffleMask);
972                     xmm5 = _mm_shuffle_epi8(xmm5, shuffleMask);
973                     xmm6 = _mm_shuffle_epi8(xmm6, shuffleMask);
974                     xmm7 = _mm_shuffle_epi8(xmm7, shuffleMask);
975 
976                     _mm_store_si128(pMMDest, xmm0);
977                     _mm_store_si128(pMMDest + 1, xmm1);
978                     _mm_store_si128(pMMDest + 2, xmm2);
979                     _mm_store_si128(pMMDest + 3, xmm3);
980                     _mm_store_si128(pMMDest + 4, xmm4);
981                     _mm_store_si128(pMMDest + 5, xmm5);
982                     _mm_store_si128(pMMDest + 6, xmm6);
983                     _mm_store_si128(pMMDest + 7, xmm7);
984                     pMMDest += 8;
985                 }
986 
987                 count &= TIERED_TAIL;
988                 if (count != 0)
989                 {
990                     cnt = count >> INSTR_128_SHIFT;
991                     for (UINT32 i = 0; i < cnt; i += 1)
992                     {
993                         xmm0 = _mm_loadu_si128(pMMSrc);
994                         pMMSrc += 1;
995                         xmm0 = _mm_shuffle_epi8(xmm0, shuffleMask);
996                         _mm_store_si128(pMMDest, xmm0);
997                         pMMDest += 1;
998                     }
999                 }
1000             }
1001             else
1002             {
1003                 // copy aligned by tiers
1004                 cnt = count >> DUAL_CACHE_SHIFT;
1005                 for (UINT32 i = 0; i < cnt; i += 1)
1006                 {
1007                     xmm0 = _mm_load_si128(pMMSrc);
1008                     xmm1 = _mm_load_si128(pMMSrc + 1);
1009                     xmm2 = _mm_load_si128(pMMSrc + 2);
1010                     xmm3 = _mm_load_si128(pMMSrc + 3);
1011                     xmm4 = _mm_load_si128(pMMSrc + 4);
1012                     xmm5 = _mm_load_si128(pMMSrc + 5);
1013                     xmm6 = _mm_load_si128(pMMSrc + 6);
1014                     xmm7 = _mm_load_si128(pMMSrc + 7);
1015                     pMMSrc += 8;
1016 
1017                     xmm0 = _mm_shuffle_epi8(xmm0, shuffleMask);
1018                     xmm1 = _mm_shuffle_epi8(xmm1, shuffleMask);
1019                     xmm2 = _mm_shuffle_epi8(xmm2, shuffleMask);
1020                     xmm3 = _mm_shuffle_epi8(xmm3, shuffleMask);
1021                     xmm4 = _mm_shuffle_epi8(xmm4, shuffleMask);
1022                     xmm5 = _mm_shuffle_epi8(xmm5, shuffleMask);
1023                     xmm6 = _mm_shuffle_epi8(xmm6, shuffleMask);
1024                     xmm7 = _mm_shuffle_epi8(xmm7, shuffleMask);
1025 
1026                     _mm_store_si128(pMMDest, xmm0);
1027                     _mm_store_si128(pMMDest + 1, xmm1);
1028                     _mm_store_si128(pMMDest + 2, xmm2);
1029                     _mm_store_si128(pMMDest + 3, xmm3);
1030                     _mm_store_si128(pMMDest + 4, xmm4);
1031                     _mm_store_si128(pMMDest + 5, xmm5);
1032                     _mm_store_si128(pMMDest + 6, xmm6);
1033                     _mm_store_si128(pMMDest + 7, xmm7);
1034                     pMMDest += 8;
1035                 }
1036 
1037                 count &= TIERED_TAIL;
1038                 if (count != 0)
1039                 {
1040                     cnt = count >> INSTR_128_SHIFT;
1041                     for (UINT32 i = 0; i < cnt; i += 1)
1042                     {
1043                         xmm0 = _mm_load_si128(pMMSrc);
1044                         pMMSrc += 1;
1045                         xmm0 = _mm_shuffle_epi8(xmm0, shuffleMask);
1046                         _mm_store_si128(pMMDest, xmm0);
1047                         pMMDest += 1;
1048                     }
1049                 }
1050             }
1051         }
1052 
1053         // handle tail copy as a fallthrough
1054         count &= TAIL_SIZE;
1055         if (count != 0)
1056         {
1057             cnt = count >> DWORD_SHIFT;
1058             ScalarSwapBytes(&pMMDest, &pMMSrc, cnt * sizeof(DWORD), swapbytes);
1059             cnt = count & BYTE_TAIL;
1060 
1061             // only words should remain, not bytes
1062             if (cnt > 0)
1063             {
1064                 ASSERT(cnt % 2 == 0);
1065                 ASSERT(swapbytes == 2);
1066                 ScalarSwapBytes(&pMMDest, &pMMSrc, cnt, swapbytes);
1067             }
1068         }
1069     }
1070 }
1071 
/*****************************************************************************\
Inline Function:
    SafeMemSet

Description:
    Fills "bytes" bytes at dst with the value "data" using ::memset.
    In debug kernel-mode builds (_DEBUG and ISTDLIB_KMD both defined) the
    call is wrapped in a structured exception handler that asserts, so a
    faulting fill is easy to spot under a debugger.

Input:
    dst   - pointer to the buffer to fill
    data  - fill value, interpreted by ::memset
    bytes - number of bytes to fill
\*****************************************************************************/
inline void SafeMemSet( void* dst, const int data, const size_t bytes )
{
#if defined(_DEBUG) && defined(ISTDLIB_KMD)
    __try
    {
        ::memset( dst, data, bytes );
    }
    // catch exceptions here so they are easily debugged
    __except(1)
    {
        ASSERT(0);
    }
#else
    ::memset( dst, data, bytes );
#endif
}
1095 
/*****************************************************************************\
Inline Function:
    SafeMemCompare

Description:
    Compares "bytes" bytes of dst and src with ::memcmp and returns its
    result. In debug kernel-mode builds (_DEBUG and ISTDLIB_KMD both defined)
    the comparison runs inside a structured exception handler; if it faults
    the handler asserts and the function returns -1.

Input:
    dst   - pointer to the first buffer
    src   - pointer to the second buffer
    bytes - number of bytes to compare

Output:
    result of ::memcmp( dst, src, bytes ), or -1 when the guarded debug
    build caught an exception
\*****************************************************************************/
inline int SafeMemCompare( const void* dst, const void* src, const size_t bytes )
{
#if defined(_DEBUG) && defined(ISTDLIB_KMD)
    __try
    {
        return ::memcmp( dst, src, bytes );
    }
    // catch exceptions here so they are easily debugged
    __except(1)
    {
        ASSERT(0);
        return -1;
    }
#else
    return ::memcmp( dst, src, bytes );
#endif
}
1120 
/*****************************************************************************\
Inline Function:
    SafeMemMove

Description:
    copies "bytes" of data from src to dst.
    dst is not corrupted if src and dst blocks of data overlap.
    A zero-length request, or dst == src, leaves memory untouched.

Input:
    dst   - pointer to destination buffer
    src   - pointer to source buffer
    bytes - number of bytes to copy
\*****************************************************************************/
inline void SafeMemMove( void *dst, const void *src, const size_t bytes )
{
    // The zero-length check is essential: the former hand-written backward
    // loop started at index bytes-1, which wraps to SIZE_MAX for bytes == 0
    // and then copied far outside both buffers.
    if( bytes == 0 || dst == src )
    {
        return;
    }

    // ::memmove picks the copy direction that is safe for overlapping ranges.
    ::memmove( dst, src, bytes );
}
1158 
/*****************************************************************************\
MACROS:
    EMIT_R_MR
    Example:  movntdqa xmm1, xmmword ptr [eax]

    EMIT_R_MR_OFFSET
    Example: movntdqa xmm1, xmmword ptr [eax + 0x10]

Description:
    Used to encode SSE4.1 instructions with parameters by emitting raw bytes
    from MSVC inline assembly.

    OPCODE - macro that emits the opcode bytes (for example MOVNTDQA_OP)
    X      - register index placed in bits 5:3 of the emitted byte (X*8)
    Y      - register index placed in bits 2:0 of the emitted byte
    OFFSET - 32-bit displacement, emitted least significant byte first

    EMIT_R_MR emits OPCODE followed by one byte 0x00 + X*8 + Y, i.e. an x86
    ModRM byte with mod = 00 (memory operand addressed by register Y, no
    displacement). In that form index 4 (ESP) and index 5 (EBP) have special
    meanings in the x86 encoding, so they cannot be used as a plain base.

    EMIT_R_MR_OFFSET emits OPCODE followed by 0x80 + X*8 + Y, i.e. a ModRM
    byte with mod = 10, and then the four displacement bytes.

    The arguments are pasted without parentheses; pass simple constants such
    as the REG_xxx indices, not compound expressions.
\*****************************************************************************/
#define EMIT_R_MR(OPCODE, X, Y )   \
    OPCODE                         \
    __asm _emit (0x00 + X*8 + Y)

#define EMIT_R_MR_OFFSET(OPCODE, X, Y, OFFSET)  \
    OPCODE                                      \
    __asm _emit (0x80 + X*8 + Y)                \
    __asm _emit (OFFSET&0xFF)                   \
    __asm _emit ((OFFSET>>8)&0xFF)              \
    __asm _emit ((OFFSET>>16)&0xFF)             \
    __asm _emit ((OFFSET>>24)&0xFF)
1181 
/*****************************************************************************\
MACROS:
    REG_XXX

Description:
    Define CPU General Purpose and XMM Register Indices (0..7) that are
    passed as the register arguments of the EMIT_R_MR / MOVNTDQA_R_MR macros.
    Only defined for MSVC builds.
    These MACROS are to be replaced with intrinsics available with .NET 2008
\*****************************************************************************/
#if defined( _MSC_VER )
#define REG_EAX  0x00
#define REG_ECX  0x01
#define REG_EDX  0x02
#define REG_EBX  0x03
#define REG_ESP  0x04
#define REG_EBP  0x05
#define REG_ESI  0x06
#define REG_EDI  0x07
#define REG_XMM0 0x00
#define REG_XMM1 0x01
#define REG_XMM2 0x02
#define REG_XMM3 0x03
#define REG_XMM4 0x04
#define REG_XMM5 0x05
#define REG_XMM6 0x06
#define REG_XMM7 0x07
#endif //#if defined( _MSC_VER )
1208 
/*****************************************************************************\
MACROS:
    MOVNTDQA_OP
    MOVNTDQA_R_MR
    MOVNTDQA_R_MR_OFFSET

Description:
    Used to emit SSE4_1 movntdqa (streaming load) instructions
        MOVNTDQA_OP          - emits the opcode bytes 66 0F 38 2A
        MOVNTDQA_R_MR        - movntdqa xmm<DST>, xmmword ptr [<SRC>]
        MOVNTDQA_R_MR_OFFSET - movntdqa xmm<DST>, xmmword ptr [<SRC> + OFFSET]

        DST - XMM Register index (REG_XMMn) that receives the loaded data
        SRC - General Purpose Register index (REG_Exx) holding the source
              address
        OFFSET - Offset to be added to the source address
\*****************************************************************************/
#define MOVNTDQA_OP     \
    _asm _emit 0x66     \
    _asm _emit 0x0F     \
    _asm _emit 0x38     \
    _asm _emit 0x2A

#define MOVNTDQA_R_MR(DST, SRC)                 \
    EMIT_R_MR(MOVNTDQA_OP, DST, SRC)

#define MOVNTDQA_R_MR_OFFSET(DST, SRC, OFFSET)  \
    EMIT_R_MR_OFFSET(MOVNTDQA_OP, DST, SRC, OFFSET)
1232 
/*****************************************************************************\
Inline Function:
    FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa

Description: Fast copy from USWC memory to cacheable system memory.
    Copies exactly one 64-byte block: four 16-byte streaming loads (movntdqa)
    from src followed by four aligned 16-byte stores (movdqa) to dst.
    Uses ecx, edx and xmm0-xmm3 and does not restore the XMM registers.
    Only compiled for 32-bit MSVC builds (inline assembly).

Input:
    dst - 16-byte aligned pointer to (cacheable) destination buffer
    src - 16-byte(req)/64-byte(optimal) aligned pointer to (USWC) source buffer
\*****************************************************************************/
#if defined( _MSC_VER ) && !defined (_WIN64)
__forceinline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa( void* dst, const void* src )
{

    __asm
    {
        ;Load the source start address into edx
        mov edx, src

        ;Load the destination address into ecx
        mov ecx, dst

        ;Pad so that the following instructions start on a 16-byte boundary
        align 16

        ; Load data from source buffer
        ; Streaming loads from the same cache line should be grouped together
        ; and not be interleaved with: a) Writes or non-streaming loads or
        ; b) Streaming loads from other cache lines (strided accesses)

        ; movntdqa xmm0, xmmword ptr [edx]
        MOVNTDQA_R_MR(REG_XMM0, REG_EDX)

        ; movntdqa xmm1, xmmword ptr [edx+16]
        MOVNTDQA_R_MR_OFFSET(REG_XMM1, REG_EDX, 16)

        ; movntdqa xmm2, xmmword ptr [edx+32]
        MOVNTDQA_R_MR_OFFSET(REG_XMM2, REG_EDX, 32)

        ; movntdqa xmm3, xmmword ptr [edx+48]
        MOVNTDQA_R_MR_OFFSET(REG_XMM3, REG_EDX, 48)

        ; Save data in destination buffer (aligned stores).
        movdqa xmmword ptr [ecx], xmm0
        movdqa xmmword ptr [ecx+16], xmm1
        movdqa xmmword ptr [ecx+32], xmm2
        movdqa xmmword ptr [ecx+48], xmm3
    }

} // FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa()
#endif //#if defined( _MSC_VER ) && !defined (_WIN64)
1283 
/*****************************************************************************\
Inline Function:
    FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu

Description: Fast copy from USWC memory (DHWORD in size) to cacheable system memory.
    Copies exactly one 64-byte block: four 16-byte streaming loads (movntdqa)
    from src followed by four unaligned 16-byte stores (movdqu) to dst.
    Uses ecx, edx and xmm0-xmm3 and does not restore the XMM registers.
    Only compiled for 32-bit MSVC builds (inline assembly).

Input:
    dst - pointer to (cacheable) destination buffer, no alignment required
    src - 16-byte(req)/64-byte(optimal) aligned pointer to (USWC) source buffer
\*****************************************************************************/
#if defined ( _MSC_VER ) && !defined(_WIN64)
__forceinline void  __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu(void* dst, const void* src )
{
    __asm
    {
        ;Load the source start address into edx
        mov edx, src

        ;Load the destination address into ecx
        mov ecx, dst

        ;Pad so that the following instructions start on a 16-byte boundary
        align 16

        ; Load data from source buffer
        ; Streaming loads from the same cache line should be grouped together
        ; and not be interleaved with: a) Writes or non-streaming loads or
        ; b) Streaming loads from other cache lines (strided accesses)

        ; movntdqa xmm0, xmmword ptr [edx]
        MOVNTDQA_R_MR(REG_XMM0, REG_EDX)

        ; movntdqa xmm1, xmmword ptr [edx+16]
        MOVNTDQA_R_MR_OFFSET(REG_XMM1, REG_EDX, 16)

        ; movntdqa xmm2, xmmword ptr [edx+32]
        MOVNTDQA_R_MR_OFFSET(REG_XMM2, REG_EDX, 32)

        ; movntdqa xmm3, xmmword ptr [edx+48]
        MOVNTDQA_R_MR_OFFSET(REG_XMM3, REG_EDX, 48)

        ; Copy data in destination buffer (unaligned stores).
        movdqu xmmword ptr [ecx], xmm0
        movdqu xmmword ptr [ecx+16], xmm1
        movdqu xmmword ptr [ecx+32], xmm2
        movdqu xmmword ptr [ecx+48], xmm3
    }
} // FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu()
#endif // #if defined( _MSC_VER ) && !defined (_WIN64)
1332 
1333 
FastMemCopyFromWC(void * dst,const void * src,const size_t bytes,CPU_INSTRUCTION_LEVEL cpuInstructionLevel)1334 inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
1335 {
1336 #if defined( _MSC_VER ) && (!defined (_WIN64)  || defined ( _In_ ) ) || defined (__GNUC__)
1337     if( cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
1338     {
1339         // Cache pointers to memory
1340         BYTE* p_dst = (BYTE*)dst;
1341         BYTE* p_src = (BYTE*)src;
1342 
1343         size_t count = bytes;
1344 
1345         if( count >= sizeof(DHWORD) )
1346         {
1347             //Streaming Load must be 16-byte aligned but should
1348             //be 64-byte aligned for optimal performance
1349             const size_t doubleHexWordAlignBytes =
1350                 GetAlignmentOffset( p_src, sizeof(DHWORD) );
1351 
1352             // Copy portion of the source memory that is not aligned
1353             if( doubleHexWordAlignBytes )
1354             {
1355                 MemCopy( p_dst, p_src, doubleHexWordAlignBytes );
1356 
1357                 p_dst += doubleHexWordAlignBytes;
1358                 p_src += doubleHexWordAlignBytes;
1359                 count -= doubleHexWordAlignBytes;
1360             }
1361 
1362             ASSERT( IsAligned( p_src, sizeof(DHWORD) ) == true );
1363 
1364             // Get the number of bytes to be copied (rounded down to nearets DHWORD)
1365             const size_t DoubleHexWordsToCopy = count / sizeof(DHWORD);
1366 
1367             if( DoubleHexWordsToCopy )
1368             {
1369                 // Determine if the destination address is aligned
1370                 const bool isDstDoubleQuadWordAligned =
1371                     IsAligned( p_dst, sizeof(DQWORD) );
1372 
1373 #if defined(_WIN64) || defined(__GNUC__)
1374                 __m128i* pMMSrc = (__m128i*)(p_src);
1375                 __m128i* pMMDest = reinterpret_cast<__m128i*>(p_dst);
1376                 __m128i  xmm0, xmm1, xmm2, xmm3;
1377 #endif
1378 
1379                 if( isDstDoubleQuadWordAligned )
1380                 {
1381 #if defined(__GNUC__)
1382                     // Sync the WC memory data before issuing the MOVNTDQA instruction.
1383                     _mm_mfence();
1384 #endif
1385                     for( size_t i=0; i<DoubleHexWordsToCopy; i++ )
1386                     {
1387 
1388 #if !defined(_WIN64) && !defined(__GNUC__)
1389                         FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa( p_dst, p_src );
1390 #else
1391                         xmm0 = _mm_stream_load_si128(pMMSrc);
1392                         xmm1 = _mm_stream_load_si128(pMMSrc + 1);
1393                         xmm2 = _mm_stream_load_si128(pMMSrc + 2);
1394                         xmm3 = _mm_stream_load_si128(pMMSrc + 3);
1395                         pMMSrc += 4;
1396 
1397                         _mm_store_si128(pMMDest, xmm0);
1398                         _mm_store_si128(pMMDest + 1, xmm1);
1399                         _mm_store_si128(pMMDest + 2, xmm2);
1400                         _mm_store_si128(pMMDest + 3, xmm3);
1401                         pMMDest += 4;
1402 #endif
1403 
1404                         p_dst += sizeof(DHWORD);
1405                         p_src += sizeof(DHWORD);
1406                         count -= sizeof(DHWORD);
1407                     }
1408                 }
1409                 else
1410                 {
1411 #if defined(__GNUC__)
1412                     // Sync the WC memory data before issuing the MOVNTDQA instruction.
1413                     _mm_mfence();
1414 #endif
1415                     for( size_t i=0; i<DoubleHexWordsToCopy; i++ )
1416                     {
1417 
1418 #if !defined(_WIN64) && !defined(__GNUC__)
1419                         FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu( p_dst, p_src );
1420 #else
1421                         xmm0 = _mm_stream_load_si128(pMMSrc);
1422                         xmm1 = _mm_stream_load_si128(pMMSrc + 1);
1423                         xmm2 = _mm_stream_load_si128(pMMSrc + 2);
1424                         xmm3 = _mm_stream_load_si128(pMMSrc + 3);
1425                         pMMSrc += 4;
1426 
1427                         _mm_storeu_si128(pMMDest, xmm0);
1428                         _mm_storeu_si128(pMMDest + 1, xmm1);
1429                         _mm_storeu_si128(pMMDest + 2, xmm2);
1430                         _mm_storeu_si128(pMMDest + 3, xmm3);
1431                         pMMDest += 4;
1432 #endif
1433 
1434                         p_dst += sizeof(DHWORD);
1435                         p_src += sizeof(DHWORD);
1436                         count -= sizeof(DHWORD);
1437                     }
1438                 }
1439             }
1440         }
1441 
1442         // Copy remaining BYTE(s)
1443         if( count )
1444         {
1445             MemCopy( p_dst, p_src, count );
1446         }
1447     }
1448     else
1449 #endif //!defined ( _WIN64 ) || defined ( _In_ )
1450     {
1451         MemCopy( dst, src, bytes );
1452     }
1453 }
1454 
1455 /*****************************************************************************\
1456 Inline Function:
1457     FastCpuBlt
1458 
1459 Description:
1460     Intel C++ Compiler CPU Blit function
1461 
1462 Parameters:
1463     BYTE* dst - destination pointer
1464     const DWORD dstPitch - pitch to increment destination pointer per count
1465     BYTE* src - source pointer
1466     const DWORD srcPitch - pitch to increment source pointer per count
1467     const DWORD stride - stride of data to copy per count, in bytes
1468     DWORD count - number of iterations to copy data
1469 
1470 \*****************************************************************************/
FastCpuBlt(BYTE * dst,const DWORD dstPitch,BYTE * src,const DWORD srcPitch,const DWORD stride,DWORD count)1471 inline void FastCpuBlt(
1472     BYTE* dst,
1473     const DWORD dstPitch,
1474     BYTE* src,
1475     const DWORD srcPitch,
1476     const DWORD stride,
1477     DWORD count )
1478 {
1479     do
1480     {
1481         MemCopy( dst, src, stride );
1482 
1483         dst += dstPitch;
1484         src += srcPitch;
1485     }
1486     while( --count > 0 );
1487 }
1488 
1489 /*****************************************************************************\
1490 Inline Function:
1491     FastCpuSet
1492 
1493 Description:
1494     Intel C++ Compiler CPU Blit function
1495 
1496 Parameters:
1497     BYTE* dst - destination pointer
1498     const DWORD dstPitch - pitch to increment destination pointer per count
1499     BYTE* src - source pointer
1500     const DWORD srcPitch - pitch to increment source pointer per count
1501     const DWORD stride - stride of data to copy per count, in bytes
1502     DWORD count - number of iterations to copy data
1503 
1504 \*****************************************************************************/
FastCpuSet(BYTE * dst,const DWORD dstPitch,const DWORD value,const DWORD stride,DWORD count)1505 inline void FastCpuSet(
1506     BYTE* dst,
1507     const DWORD dstPitch,
1508     const DWORD value,
1509     const DWORD stride,
1510     DWORD count )
1511 {
1512     do
1513     {
1514         SafeMemSet( dst, value, stride );
1515 
1516         dst += dstPitch;
1517     }
1518     while( --count > 0 );
1519 }
1520 
/*****************************************************************************\
Inline Function:
    FastCpuBltFromUSWC

Description:
    Intel C++ Compiler CPU Blit function from non-temporal to temporal memory
    This function is optimized using SSE4 instructions which use accelerated write-combined
    loads that bypass the cache.
    Copies "count" rows of "stride" bytes each with FastMemCopyFromWC. On
    32-bit builds xmm0-xmm3 are saved before the copy and restored after it.
    A count of zero copies nothing. Only compiled for MSVC.

Parameters:
    BYTE* dst - destination pointer (temporal)
    const DWORD dstPitch - pitch to increment destination pointer per count
    BYTE* src - source pointer (non-temporal)
    const DWORD srcPitch - pitch to increment source pointer per count
    const DWORD stride - stride of data to copy per count, in bytes
    DWORD count - number of iterations to copy data
    CPU_INSTRUCTION_LEVEL level - cpu instruction level (SSE support level)

\*****************************************************************************/
#if defined ( _MSC_VER )
inline void FastCpuBltFromUSWC(
    BYTE* dst,
    const DWORD dstPitch,
    BYTE* src,
    const DWORD srcPitch,
    const DWORD stride,
    DWORD count,
    CPU_INSTRUCTION_LEVEL level)
{
    // Nothing to copy. Without this check the post-tested loop below would
    // copy one row, wrap the unsigned counter and keep copying.
    if( count == 0 )
    {
        return;
    }

#ifndef _WIN64

    //back up the XMM registers just in case
     __declspec( align(16) ) BYTE backUpRegisters[16*4];

     void *tempPtr = (void *) backUpRegisters;

    __asm mov ecx, tempPtr
    __asm movdqa xmmword ptr [ecx + 16*0], xmm0
    __asm movdqa xmmword ptr [ecx + 16*1], xmm1
    __asm movdqa xmmword ptr [ecx + 16*2], xmm2
    __asm movdqa xmmword ptr [ecx + 16*3], xmm3

#endif //_WIN64
    do
    {
        iSTD::FastMemCopyFromWC( dst, src, stride, level );

        dst += dstPitch;
        src += srcPitch;
    }
    while( --count > 0 );
#ifndef _WIN64
    // restore the XMM registers saved above
    __asm mov ecx, tempPtr
    __asm movdqa xmm0, xmmword ptr [ecx + 16*0]
    __asm movdqa xmm1, xmmword ptr [ecx + 16*1]
    __asm movdqa xmm2, xmmword ptr [ecx + 16*2]
    __asm movdqa xmm3, xmmword ptr [ecx + 16*3]
#endif //_WIN64
}
#endif
1583 
1584 
1585 /*****************************************************************************\
1586 Inline Function:
1587     FindWordBufferMinMax
1588 
1589 Description:
1590     Finds the min and max unsigned 16-bit values in the buffer
1591 
1592 Input:
1593     WORD* pBuffer - pointer to 16-bit buffer
1594     const DWORD bytes - size of buffer in bytes
1595 
1596 Output:
1597     WORD &min - minimum 16-bit value
1598     WORD &max - maximum 16-bit value
1599 
1600 \*****************************************************************************/
FindWordBufferMinMax(WORD * pBuffer,const DWORD bytes,WORD & min,WORD & max)1601 inline void FindWordBufferMinMax(
1602     WORD* pBuffer,
1603     const DWORD bytes,
1604     WORD &min,
1605     WORD &max )
1606 {
1607     PrefetchBuffer( (BYTE*)pBuffer, bytes );
1608 
1609     WORD wValue = 0;
1610     WORD wMinValue = 0xffff;
1611     WORD wMaxValue = 0x0000;
1612 
1613     size_t count = bytes / sizeof(WORD);
1614     size_t i = 0;
1615 
1616     if( IsAligned( pBuffer, sizeof(WORD) ) )
1617     {
1618         const size_t DoubleQuadWordsPerPrefetch  = sizeof(PREFETCH) / sizeof(DQWORD);
1619         const size_t WordsPerPrefetch            = sizeof(PREFETCH) / sizeof(WORD);
1620         const size_t WordsPerDoubleQuadWord      = sizeof(DQWORD) / sizeof(WORD);
1621 
1622         Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
1623         Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
1624 
1625         // Find min/max per cacheline of values
1626         if( count >= WordsPerDoubleQuadWord )
1627         {
1628             const size_t doubleQuadwordAlignWords =
1629                 GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(WORD);
1630 
1631             // If pBuffer is not double-quadword aligned then process
1632             // until aligned
1633             if( doubleQuadwordAlignWords )
1634             {
1635                 for( i = 0; i < doubleQuadwordAlignWords; i++ )
1636                 {
1637                     wValue = *pBuffer++;
1638 
1639                     wMinValue = Min( wMinValue, wValue );
1640                     wMaxValue = Max( wMaxValue, wValue );
1641                 }
1642 
1643                 count -= doubleQuadwordAlignWords;
1644             }
1645 
1646             // Find min/max per cacheline of values
1647             if( count >= WordsPerDoubleQuadWord )
1648             {
1649                 __m128i mValue128i;
1650 
1651                 // Need to convert unsigned values to signed values
1652                 // since min/max is signed op
1653                 __m128i mSignedScale128i = _mm_set1_epi16((WORD)0x8000);
1654 
1655                 // Signed min/max initialization
1656                 __m128i mMinValue128i    = _mm_set1_epi16(wMinValue-(WORD)0x8000);
1657                 __m128i mMaxValue128i    = _mm_set1_epi16(wMaxValue-(WORD)0x8000);
1658 
1659                 while( count >= WordsPerPrefetch )
1660                 {
1661                     Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
1662 
1663                     // Process cacheline values per pass
1664                     count -= WordsPerPrefetch;
1665 
1666                     for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
1667                     {
1668                         // Get double-quadword values
1669                         mValue128i = *(__m128i*)pBuffer;
1670                         pBuffer += WordsPerDoubleQuadWord;
1671 
1672                         // Make values signed
1673                         mValue128i = _mm_sub_epi16( mValue128i,
1674                             mSignedScale128i );
1675 
1676                         // Determine parallel min/max
1677                         mMinValue128i = _mm_min_epi16( mMinValue128i,
1678                             mValue128i );
1679                         mMaxValue128i = _mm_max_epi16( mMaxValue128i,
1680                             mValue128i );
1681                     }
1682                 }
1683 
1684                 // Process double-quadword values per pass for remainder
1685                 while( count >= WordsPerDoubleQuadWord )
1686                 {
1687                     // Process double-quadword values per pass
1688                     count -= WordsPerDoubleQuadWord;
1689 
1690                     // Get double-quadword values
1691                     mValue128i = *(__m128i*)pBuffer;
1692                     pBuffer += WordsPerDoubleQuadWord;
1693 
1694                     // Make values signed
1695                     mValue128i = _mm_sub_epi16( mValue128i,
1696                         mSignedScale128i );
1697 
1698                     // Determine parallel min/max
1699                     mMinValue128i = _mm_min_epi16( mMinValue128i,
1700                         mValue128i );
1701                     mMaxValue128i = _mm_max_epi16( mMaxValue128i,
1702                         mValue128i );
1703                 }
1704 
1705                 // Determine wMinValue
1706 
1707                 // Make values unsigned
1708                 mMinValue128i = _mm_add_epi16( mMinValue128i,
1709                     mSignedScale128i );
1710 
1711                 // Extract each value in double-quadword to find minimum
1712                 // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
1713                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 0 );
1714                 wMinValue = Min( wMinValue, wValue );
1715                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 1 );
1716                 wMinValue = Min( wMinValue, wValue );
1717                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 2 );
1718                 wMinValue = Min( wMinValue, wValue );
1719                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 3 );
1720                 wMinValue = Min( wMinValue, wValue );
1721                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 4 );
1722                 wMinValue = Min( wMinValue, wValue );
1723                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 5 );
1724                 wMinValue = Min( wMinValue, wValue );
1725                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 6 );
1726                 wMinValue = Min( wMinValue, wValue );
1727                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 7 );
1728                 wMinValue = Min( wMinValue, wValue );
1729 
1730                 // Determine wMaxValue
1731 
1732                 // Make values unsigned
1733                 mMaxValue128i = _mm_add_epi16( mMaxValue128i,
1734                     mSignedScale128i );
1735 
1736                 // Extract each value in double-quadword to find maximum
1737                 // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
1738                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 0 );
1739                 wMaxValue = Max( wMaxValue, wValue );
1740                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 1 );
1741                 wMaxValue = Max( wMaxValue, wValue );
1742                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 2 );
1743                 wMaxValue = Max( wMaxValue, wValue );
1744                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 3 );
1745                 wMaxValue = Max( wMaxValue, wValue );
1746                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 4 );
1747                 wMaxValue = Max( wMaxValue, wValue );
1748                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 5 );
1749                 wMaxValue = Max( wMaxValue, wValue );
1750                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 6 );
1751                 wMaxValue = Max( wMaxValue, wValue );
1752                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 7 );
1753                 wMaxValue = Max( wMaxValue, wValue );
1754 
1755             } // if( count >= WordsPerDoubleQuadWord )
1756         } // if( count >= WordsPerDoubleQuadWord )
1757     }
1758 #ifndef _WIN64
1759     else // if( IsAligned( pBuffer, sizeof(WORD) ) )
1760     {
1761         const size_t QuadWordsPerCacheline   = sizeof(CACHELINE) / sizeof(QWORD);
1762         const size_t WordsPerCacheline       = sizeof(CACHELINE) / sizeof(WORD);
1763         const size_t WordsPerQuadWord        = sizeof(QWORD) / sizeof(WORD);
1764 
1765         Prefetch( (BYTE*)pBuffer + sizeof(CACHELINE) );
1766         Prefetch( (BYTE*)pBuffer + 2 * sizeof(CACHELINE) );
1767 
1768         if( count >= WordsPerQuadWord )
1769         {
1770             __m64   mValue64;
1771 
1772             // Need to convert unsigned values to signed values
1773             // since min/max is signed op
1774             __m64   mSignedScale64  = _mm_set1_pi16((WORD)0x8000);
1775 
1776             // Signed min/max initialization
1777             __m64   mMinValue64     = _mm_set1_pi16(wMinValue-(WORD)0x8000);
1778             __m64   mMaxValue64     = _mm_set1_pi16(wMaxValue-(WORD)0x8000);
1779 
1780             // Find min/max per cacheline of values
1781             while( count >= WordsPerCacheline )
1782             {
1783                 Prefetch( (BYTE*)pBuffer + sizeof(CACHELINE) );
1784 
1785                 // Process cacheline values per pass
1786                 count -= WordsPerCacheline;
1787 
1788                 for( i = 0; i < QuadWordsPerCacheline; i++ )
1789                 {
1790                     // Get quadword values
1791                     mValue64 = *(__m64*)pBuffer;
1792                     pBuffer += WordsPerQuadWord;
1793 
1794                     // Make values signed
1795                     mValue64 = _mm_sub_pi16( mValue64, mSignedScale64 );
1796 
1797                     // Determine parallel min/max
1798                     mMinValue64 = _mm_min_pi16( mMinValue64, mValue64 );
1799                     mMaxValue64 = _mm_max_pi16( mMaxValue64, mValue64 );
1800                 }
1801             }
1802 
1803             // Process quadword values per pass for remainder
1804             while( count >= WordsPerQuadWord )
1805             {
1806                 // Process quadword values per pass
1807                 count -= WordsPerQuadWord;
1808 
1809                 // Get quadword values
1810                 mValue64 = *(__m64*)pBuffer;
1811                 pBuffer += WordsPerQuadWord;
1812 
1813                 // Make values signed
1814                 mValue64 = _mm_sub_pi16( mValue64, mSignedScale64 );
1815 
1816                 // Determine parallel min/max
1817                 mMinValue64 = _mm_min_pi16( mMinValue64, mValue64 );
1818                 mMaxValue64 = _mm_max_pi16( mMaxValue64, mValue64 );
1819             }
1820 
1821             // Determine wMinValue
1822 
1823             // Make values unsigned
1824             mMinValue64 = _mm_add_pi16( mMinValue64, mSignedScale64 );
1825 
1826             // Extract each value in quadword to find minimum
1827             // for( i = 0; i < WordsPerQuadWord; i++ )
1828             wValue = (WORD)_mm_extract_pi16( mMinValue64, 0 );
1829             wMinValue = Min( wMinValue, wValue );
1830             wValue = (WORD)_mm_extract_pi16( mMinValue64, 1 );
1831             wMinValue = Min( wMinValue, wValue );
1832             wValue = (WORD)_mm_extract_pi16( mMinValue64, 2 );
1833             wMinValue = Min( wMinValue, wValue );
1834             wValue = (WORD)_mm_extract_pi16( mMinValue64, 3 );
1835             wMinValue = Min( wMinValue, wValue );
1836 
1837             // Determine wMaxValue
1838 
1839             // Make values unsigned
1840             mMaxValue64 = _mm_add_pi16( mMaxValue64, mSignedScale64 );
1841 
1842             // Extract each value in quadword to find maximum
1843             // for( i = 0; i < WordsPerQuadWord; i++ )
1844             wValue = (WORD)_mm_extract_pi16( mMaxValue64, 0 );
1845             wMaxValue = Max( wMaxValue, wValue );
1846             wValue = (WORD)_mm_extract_pi16( mMaxValue64, 1 );
1847             wMaxValue = Max( wMaxValue, wValue );
1848             wValue = (WORD)_mm_extract_pi16( mMaxValue64, 2 );
1849             wMaxValue = Max( wMaxValue, wValue );
1850             wValue = (WORD)_mm_extract_pi16( mMaxValue64, 3 );
1851             wMaxValue = Max( wMaxValue, wValue );
1852 
1853             _mm_empty();
1854 
1855         } // if( count >= WordsPerQuadWord )
1856     }
1857 #endif
1858 
1859     // Find min/max per value
1860     while( count > 0 )
1861     {
1862         count -= 1;
1863 
1864         wValue = *pBuffer++;
1865 
1866         wMinValue = Min( wMinValue, wValue );
1867         wMaxValue = Max( wMaxValue, wValue );
1868     }
1869 
1870     min = wMinValue;
1871     max = wMaxValue;
1872 }
1873 
1874 
1875 /*****************************************************************************\
1876 Inline Function:
1877     FindWordBufferMinMaxRestart
1878 
1879 Description:
1880     Finds the min and max unsigned 32-bit values in the buffer
1881     Excludes a restart value from min or max values
1882 
1883 Input:
1884     WORD* pBuffer - pointer to 32-bit buffer
1885     const DWORD bytes - size of buffer in bytes
1886     const WORD restart - restart index to ignore
1887     cpuInstructionLevel - indicates if SSE_4.1 is available
1888 
1889 Output:
1890     WORD &min - minimum 32-bit value
1891     WORD &max - maximum 32-bit value
1892 
1893 \*****************************************************************************/
FindWordBufferMinMaxRestart(WORD * pBuffer,const DWORD bytes,const WORD restart,WORD & min,WORD & max,CPU_INSTRUCTION_LEVEL cpuInstructionLevel)1894 inline void FindWordBufferMinMaxRestart(
1895     WORD* pBuffer,
1896     const DWORD bytes,
1897     const WORD restart,
1898     WORD &min,
1899     WORD &max,
1900     CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
1901 {
1902 //    PrefetchBuffer( (BYTE*)pBuffer, bytes );
1903 
1904     WORD wValue = 0;
1905     WORD wMinValue = 0xffff;
1906     WORD wMaxValue = 0x0000;
1907 
1908     size_t count = bytes / sizeof(WORD);
1909 
1910 #ifdef USE_SSE4_1
1911 
1912     size_t i = 0;
1913 
1914     if( IsAligned( pBuffer, sizeof(WORD) ) &&
1915         cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
1916     {
1917         const DWORD DoubleQuadWordsPerPrefetch  = sizeof(PREFETCH) / sizeof(DQWORD);
1918         const DWORD WordsPerPrefetch            = sizeof(PREFETCH) / sizeof(WORD);
1919         const DWORD WordsPerDoubleQuadWord      = sizeof(DQWORD) / sizeof(WORD);
1920 
1921         Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
1922         Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
1923 
1924         // Find min/max per cacheline of values
1925         if( count >= WordsPerDoubleQuadWord )
1926         {
1927             const size_t doubleQuadwordAlignWords =
1928                 GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(WORD);
1929 
1930             // If pBuffer is not double-quadword aligned then process
1931             // until aligned
1932             if( doubleQuadwordAlignWords )
1933             {
1934                 for( i = 0; i < doubleQuadwordAlignWords; i++ )
1935                 {
1936                     wValue = *pBuffer++;
1937 
1938                     if (wValue == restart) {
1939                         continue;
1940                     }
1941                     wMinValue = Min( wMinValue, wValue );
1942                     wMaxValue = Max( wMaxValue, wValue );
1943                 }
1944 
1945                 count -= doubleQuadwordAlignWords;
1946             }
1947 
1948             // Find min/max per cacheline of values
1949             if( count >= WordsPerDoubleQuadWord )
1950             {
1951                 __m128i mInput, mRestarts, mMask;
1952                 __m128i mAll_ones;
1953                 __m128i mMinValue128i, mMaxValue128i;
1954 
1955                 // This is just used for andnot mInput
1956                 mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
1957                 mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
1958 
1959                 // start with really high min and really low max
1960                 // What should happen if all values are restart?
1961                 mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
1962                 mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
1963                 mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
1964                 mMaxValue128i.m128i_u64[1] = 0x0000000000000000;
1965 
1966                 // Initialize register used for testing for restart index.
1967                 mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] =
1968                     (((UINT64) restart) << 48) |
1969                     (((UINT64) restart) << 32) |
1970                     (((UINT64) restart) << 16) |
1971                     ((UINT64) restart);
1972 
1973                 while( count >= WordsPerPrefetch )
1974                 {
1975                     Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
1976 
1977                     // Process cacheline values per pass
1978                     count -= WordsPerPrefetch;
1979 
1980                     for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
1981                     {
1982                         // Get double-quadword values
1983                         mInput = *(__m128i*)pBuffer;
1984                         pBuffer += WordsPerDoubleQuadWord;
1985 
1986                         // Make mask of non-restart_index fields
1987                         mMask = _mm_andnot_si128(_mm_cmpeq_epi16(mInput, mRestarts), mAll_ones);
1988 
1989                         // Copy minimum and maximum fields for non-restarts
1990                         mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu16(mMinValue128i, mInput), mMask );
1991                         mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu16(mMaxValue128i, mInput), mMask );
1992                     }
1993                 }
1994 
1995                 // Process double-quadword values per pass for remainder
1996                 while( count >= WordsPerDoubleQuadWord )
1997                 {
1998                     // Process double-quadword values per pass
1999                     count -= WordsPerDoubleQuadWord;
2000 
2001                     // Get double-quadword values
2002                     mInput = *(__m128i*)pBuffer;
2003                     pBuffer += WordsPerDoubleQuadWord;
2004 
2005                     // Make mask of non-restart_index fields
2006                     mMask = _mm_andnot_si128(_mm_cmpeq_epi16(mInput, mRestarts), mAll_ones);
2007 
2008                     // Copy minimum and maximum fields for non-restarts
2009                     mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu16(mMinValue128i, mInput), mMask );
2010                     mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu16(mMaxValue128i, mInput), mMask );
2011                 }
2012 
2013                 // Determine wMinValue
2014 
2015                 // Extract each value in double-quadword to find minimum
2016                 // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
2017                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 0 );
2018                 wMinValue = Min( wMinValue, wValue );
2019                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 1 );
2020                 wMinValue = Min( wMinValue, wValue );
2021                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 2 );
2022                 wMinValue = Min( wMinValue, wValue );
2023                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 3 );
2024                 wMinValue = Min( wMinValue, wValue );
2025                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 4 );
2026                 wMinValue = Min( wMinValue, wValue );
2027                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 5 );
2028                 wMinValue = Min( wMinValue, wValue );
2029                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 6 );
2030                 wMinValue = Min( wMinValue, wValue );
2031                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 7 );
2032                 wMinValue = Min( wMinValue, wValue );
2033 
2034                 // Determine wMaxValue
2035 
2036                 // Extract each value in double-quadword to find maximum
2037                 // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
2038                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 0 );
2039                 wMaxValue = Max( wMaxValue, wValue );
2040                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 1 );
2041                 wMaxValue = Max( wMaxValue, wValue );
2042                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 2 );
2043                 wMaxValue = Max( wMaxValue, wValue );
2044                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 3 );
2045                 wMaxValue = Max( wMaxValue, wValue );
2046                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 4 );
2047                 wMaxValue = Max( wMaxValue, wValue );
2048                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 5 );
2049                 wMaxValue = Max( wMaxValue, wValue );
2050                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 6 );
2051                 wMaxValue = Max( wMaxValue, wValue );
2052                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 7 );
2053                 wMaxValue = Max( wMaxValue, wValue );
2054 
2055             } // if( count >= WordsPerDoubleQuadWord )
2056         } // if( count >= WordsPerDoubleQuadWord )
2057     }
2058 
2059 #endif // USE_SSE4_1
2060 
2061     // Find min/max per value
2062     while( count > 0 )
2063     {
2064         count -= 1;
2065 
2066         wValue = *pBuffer++;
2067 
2068         if (wValue == restart) {
2069             continue;
2070         }
2071         wMinValue = Min( wMinValue, wValue );
2072         wMaxValue = Max( wMaxValue, wValue );
2073     }
2074 
2075     min = wMinValue;
2076     max = wMaxValue;
2077 }
2078 
2079 
2080 /*****************************************************************************\
2081 Inline Function:
2082     FindDWordBufferMinMax
2083 
2084 Description:
2085     Finds the min and max unsigned 32-bit values in the buffer
2086 
2087 Input:
2088     DWORD* pBuffer - pointer to 32-bit buffer
2089     const DWORD bytes - size of buffer in bytes
2090 
2091 Output:
2092     DWORD &min - minimum 32-bit value
2093     DWORD &max - maximum 32-bit value
2094 
2095 \*****************************************************************************/
FindDWordBufferMinMax(DWORD * pBuffer,const DWORD bytes,DWORD & min,DWORD & max)2096 inline void FindDWordBufferMinMax(
2097     DWORD* pBuffer,
2098     const DWORD bytes,
2099     DWORD &min,
2100     DWORD &max )
2101 {
2102     PrefetchBuffer( (BYTE*)pBuffer, bytes );
2103 
2104     DWORD wValue = 0;
2105     DWORD wMinValue = 0xffffffff;
2106     DWORD wMaxValue = 0x00000000;
2107 
2108     DWORD count = bytes / sizeof(DWORD);
2109     DWORD i = 0;
2110 
2111     if( IsAligned( pBuffer, sizeof(DWORD) ) )
2112     {
2113         const DWORD DoubleQuadWordsPerPrefetch  = sizeof(PREFETCH) / sizeof(DQWORD);
2114         const DWORD DWordsPerPrefetch            = sizeof(PREFETCH) / sizeof(DWORD);
2115         const DWORD DWordsPerDoubleQuadWord      = sizeof(DQWORD) / sizeof(DWORD);
2116 
2117         Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
2118         Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2119 
2120         // Find min/max per cacheline of values
2121         if( count >= DWordsPerDoubleQuadWord )
2122         {
2123             const DWORD doubleQuadwordAlignWords =
2124                 GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(DWORD);
2125 
2126             // If pBuffer is not double-quadword aligned then process
2127             // until aligned
2128             if( doubleQuadwordAlignWords )
2129             {
2130                 for( i = 0; i < doubleQuadwordAlignWords; i++ )
2131                 {
2132                     wValue = *pBuffer++;
2133 
2134                     wMinValue = Min( wMinValue, wValue );
2135                     wMaxValue = Max( wMaxValue, wValue );
2136                 }
2137 
2138                 count -= doubleQuadwordAlignWords;
2139             }
2140 
2141             // Find min/max per cacheline of values
2142             if( count >= DWordsPerPrefetch )
2143             {
2144                 __m128i mValue128i;
2145                 __m128 mValue128;
2146 
2147                 // Signed min/max initialization
2148                 // need extra QWORD bits for SSE2 FP conversion
2149                 __m128  mMinValue128 = _mm_set1_ps( (float)( (QWORD)wMinValue  ) );
2150                 __m128  mMaxValue128 = _mm_set1_ps( (float)( wMaxValue  ) );
2151 
2152                 while( count >= DWordsPerPrefetch )
2153                 {
2154                     Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2155 
2156                     // Process cacheline values per pass
2157                     count -= DWordsPerPrefetch;
2158 
2159                     for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
2160                     {
2161                         // Get double-quadword values
2162                         mValue128i = *(__m128i*)pBuffer;
2163                         pBuffer += DWordsPerDoubleQuadWord;
2164 
2165                         // Convert to FP
2166                         mValue128 = _mm_cvtepi32_ps( mValue128i );
2167 
2168                         // Determine parallel min/max
2169                         mMinValue128 = _mm_min_ps( mMinValue128,
2170                             mValue128 );
2171                         mMaxValue128 = _mm_max_ps( mMaxValue128,
2172                             mValue128 );
2173                     }
2174                 }
2175 
2176                 // Process double-quadword values per pass for remainder
2177                 while( count >= DWordsPerDoubleQuadWord )
2178                 {
2179                     // Process double-quadword values per pass
2180                     count -= DWordsPerDoubleQuadWord;
2181 
2182                     // Get double-quadword values
2183                     mValue128i = *(__m128i*)pBuffer;
2184                     pBuffer += DWordsPerDoubleQuadWord;
2185 
2186                     // Convert to FP
2187                     mValue128 = _mm_cvtepi32_ps( mValue128i );
2188 
2189                     // Determine parallel min/max
2190                     mMinValue128 = _mm_min_ps( mMinValue128,
2191                         mValue128 );
2192                     mMaxValue128 = _mm_max_ps( mMaxValue128,
2193                         mValue128 );
2194                 }
2195 
2196                 // Determine wMinValue
2197 
2198                 // Convert back to DWORD
2199                 __m128i mMinValue128i = _mm_cvtps_epi32( mMinValue128 );
2200 
2201                 // Extract each value in double-quadword to find minimum
2202                 // Grab element 0 from m128i reg:   3 | 2 | 1 | 0
2203                 wValue = (DWORD)_mm_cvtsi128_si32( mMinValue128i );
2204                 wMinValue = Min( wMinValue, wValue );
2205                 // Grab element 1 from m128i reg:   3 | 2 | 1 | 0
2206                 wValue = (DWORD)_mm_cvtsi128_si32(
2207                         _mm_srli_si128( mMinValue128i, 4 ) );
2208                 wMinValue = Min( wMinValue, wValue );
2209                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
2210                 wValue = (DWORD)_mm_cvtsi128_si32(
2211                         _mm_srli_si128( mMinValue128i, 8 ) );
2212                 wMinValue = Min( wMinValue, wValue );
2213                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
2214                 wValue = (DWORD)_mm_cvtsi128_si32(
2215                         _mm_srli_si128( mMinValue128i, 12 ) );
2216                 wMinValue = Min( wMinValue, wValue );
2217 
2218                 // Determine wMaxValue
2219 
2220                 // Convert back to DWORD
2221                 __m128i mMaxValue128i = _mm_cvtps_epi32( mMaxValue128 );
2222 
2223                 // Extract each value in double-quadword to find maximum
2224                 // Grab element 0 from m128i reg:   3 | 2 | 1 | 0
2225                 wValue = (DWORD)_mm_cvtsi128_si32( mMaxValue128i );
2226                 wMaxValue = Max( wMaxValue, wValue );
2227                 // Grab element 1 from m128i reg:   3 | 2 | 1 | 0
2228                 wValue = (DWORD)_mm_cvtsi128_si32(
2229                         _mm_srli_si128( mMaxValue128i, 4 ) );
2230                 wMaxValue = Max( wMaxValue, wValue );
2231                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
2232                 wValue = (DWORD)_mm_cvtsi128_si32(
2233                         _mm_srli_si128( mMaxValue128i, 8 ) );
2234                 wMaxValue = Max( wMaxValue, wValue );
2235                 // Grab element 3 from m128i reg:   3 | 2 | 1 | 0
2236                 wValue = (DWORD)_mm_cvtsi128_si32(
2237                         _mm_srli_si128( mMaxValue128i, 12 ) );
2238                 wMaxValue = Max( wMaxValue, wValue );
2239 
2240             } // if( count >= DWordsPerDoubleQuadWord )
2241         } // if( count >= DWordsPerDoubleQuadWord )
2242     }
2243 
2244     // Find min/max per value
2245     while( count > 0 )
2246     {
2247         count -= 1;
2248 
2249         wValue = *pBuffer++;
2250 
2251         wMinValue = Min( wMinValue, wValue );
2252         wMaxValue = Max( wMaxValue, wValue );
2253     }
2254 
2255     min = wMinValue;
2256     max = wMaxValue;
2257 }
2258 
2259 
2260 /*****************************************************************************\
2261 Inline Function:
2262     FindDWordBufferMinMaxRestart
2263 
2264 Description:
2265     Finds the min and max unsigned 32-bit values in the buffer
2266     Excludes a restart value from min or max values
2267 
2268 Input:
2269     DWORD* pBuffer - pointer to 32-bit buffer
2270     const DWORD bytes - size of buffer in bytes
2271     const DWORD restart - restart index to ignore
2272     cpuInstructionLevel - indicates if SSE_4.1 is available
2273 
2274 Output:
2275     DWORD &min - minimum 32-bit value
2276     DWORD &max - maximum 32-bit value
2277 
2278 \*****************************************************************************/
FindDWordBufferMinMaxRestart(DWORD * pBuffer,const DWORD bytes,const DWORD restart,DWORD & min,DWORD & max,CPU_INSTRUCTION_LEVEL cpuInstructionLevel)2279 inline void FindDWordBufferMinMaxRestart(
2280     DWORD* pBuffer,
2281     const DWORD bytes,
2282     const DWORD restart,
2283     DWORD &min,
2284     DWORD &max,
2285     CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
2286 {
2287 //    PrefetchBuffer( (BYTE*)pBuffer, bytes );
2288 
2289     DWORD wValue = 0;
2290     DWORD wMinValue = 0xffffffff;
2291     DWORD wMaxValue = 0x00000000;
2292 
2293     DWORD count = bytes / sizeof(DWORD);
2294 
2295 #ifdef USE_SSE4_1
2296 
2297     DWORD i = 0;
2298 
2299     if( IsAligned( pBuffer, sizeof(DWORD) ) &&
2300         cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
2301     {
2302         const DWORD DoubleQuadWordsPerPrefetch  = sizeof(PREFETCH) / sizeof(DQWORD);
2303         const DWORD DWordsPerPrefetch            = sizeof(PREFETCH) / sizeof(DWORD);
2304         const DWORD DWordsPerDoubleQuadWord      = sizeof(DQWORD) / sizeof(DWORD);
2305 
2306         Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
2307         Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2308 
2309         // Find min/max per cacheline of values
2310         if( count >= DWordsPerDoubleQuadWord )
2311         {
2312             const DWORD doubleQuadwordAlignWords =
2313                 GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(DWORD);
2314 
2315             // If pBuffer is not double-quadword aligned then process
2316             // until aligned
2317             if( doubleQuadwordAlignWords )
2318             {
2319                 for( i = 0; i < doubleQuadwordAlignWords; i++ )
2320                 {
2321                     wValue = *pBuffer++;
2322 
2323                     if (wValue == restart) {
2324                         continue;
2325                     }
2326                     wMinValue = Min( wMinValue, wValue );
2327                     wMaxValue = Max( wMaxValue, wValue );
2328                 }
2329 
2330                 count -= doubleQuadwordAlignWords;
2331             }
2332 
2333             // Find min/max per cacheline of values
2334             if( count >= DWordsPerPrefetch )
2335             {
2336                 __m128i mInput, mRestarts, mMask;
2337                 __m128i mAll_ones;
2338                 __m128i mMinValue128i, mMaxValue128i;
2339 
2340                 // This is just used for andnot mInput
2341                 mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
2342                 mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
2343 
2344                 // start with really high min and really low max
2345                 // What should happen if all values are restart?
2346                 mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
2347                 mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
2348                 mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
2349                 mMaxValue128i.m128i_u64[1] = 0x0000000000000000;
2350 
2351                 // Initialize register used for testing for restart index.
2352                 mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] = (((UINT64) restart) << 32) | ((UINT64) restart);
2353 
2354                 while( count >= DWordsPerPrefetch )
2355                 {
2356                     Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2357 
2358                     // Process cacheline values per pass
2359                     count -= DWordsPerPrefetch;
2360 
2361                     for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
2362                     {
2363                       // Get double-quadword values
2364                         mInput = *(__m128i*)pBuffer;
2365                         pBuffer += DWordsPerDoubleQuadWord;
2366                        // Make mask of non-restart_index fields
2367                       mMask = _mm_andnot_si128(_mm_cmpeq_epi32(mInput, mRestarts), mAll_ones);
2368 
2369                         // Copy minimum and maximum fields for non-restarts
2370                         mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu32(mMinValue128i, mInput), mMask );
2371                         mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu32(mMaxValue128i, mInput), mMask );
2372                     }
2373                 }
2374 
2375                 // Process double-quadword values per pass for remainder
2376                 while( count >= DWordsPerDoubleQuadWord )
2377                 {
2378                     // Process double-quadword values per pass
2379                     count -= DWordsPerDoubleQuadWord;
2380 
2381                     // Get double-quadword values
2382                     mInput = *(__m128i*)pBuffer;
2383                     pBuffer += DWordsPerDoubleQuadWord;
2384 
2385                     // Make mask of non-restart_index fields
2386                     mMask = _mm_andnot_si128(_mm_cmpeq_epi32(mInput, mRestarts), mAll_ones);
2387 
2388                     // Copy minimum and maximum fields for non-restarts
2389                     mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu32(mMinValue128i, mInput), mMask );
2390                     mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu32(mMaxValue128i, mInput), mMask );
2391                 }
2392 
2393                 // Determine wMinValue
2394 
2395                 // Extract each value in double-quadword to find minimum
2396                 // Grab element 0 from m128i reg:   3 | 2 | 1 | 0
2397                 wValue = (DWORD)_mm_cvtsi128_si32( mMinValue128i );
2398                 wMinValue = Min( wMinValue, wValue );
2399                 // Grab element 1 from m128i reg:   3 | 2 | 1 | 0
2400                 wValue = (DWORD)_mm_cvtsi128_si32(
2401                         _mm_srli_si128( mMinValue128i, 4 ) );
2402                 wMinValue = Min( wMinValue, wValue );
2403                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
2404                 wValue = (DWORD)_mm_cvtsi128_si32(
2405                         _mm_srli_si128( mMinValue128i, 8 ) );
2406                 wMinValue = Min( wMinValue, wValue );
2407                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
2408                 wValue = (DWORD)_mm_cvtsi128_si32(
2409                         _mm_srli_si128( mMinValue128i, 12 ) );
2410                 wMinValue = Min( wMinValue, wValue );
2411                 // Determine wMaxValue
2412                 // Extract each value in double-quadword to find maximum
2413                 // Grab element 0 from m128i reg:   3 | 2 | 1 | 0
2414                 wValue = (DWORD)_mm_cvtsi128_si32( mMaxValue128i );
2415                 wMaxValue = Max( wMaxValue, wValue );
2416                 // Grab element 1 from m128i reg:   3 | 2 | 1 | 0
2417                 wValue = (DWORD)_mm_cvtsi128_si32(
2418                         _mm_srli_si128( mMaxValue128i, 4 ) );
2419                 wMaxValue = Max( wMaxValue, wValue );
2420                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
2421                 wValue = (DWORD)_mm_cvtsi128_si32(
2422                         _mm_srli_si128( mMaxValue128i, 8 ) );
2423                 wMaxValue = Max( wMaxValue, wValue );
2424                 // Grab element 3 from m128i reg:   3 | 2 | 1 | 0
2425                 wValue = (DWORD)_mm_cvtsi128_si32(
2426                         _mm_srli_si128( mMaxValue128i, 12 ) );
2427                 wMaxValue = Max( wMaxValue, wValue );
2428 
2429             } // if( count >= DWordsPerPrefetch )
2430         } // if( count >= DWordsPerDoubleQuadWord )
2431     }
2432 
2433 #endif // USE_SSE4_1
2434 
2435     // Find min/max per value
2436     while( count > 0 )
2437     {
2438         count -= 1;
2439 
2440         wValue = *pBuffer++;
2441 
2442         if (wValue == restart) {
2443             continue;
2444         }
2445         wMinValue = Min( wMinValue, wValue );
2446         wMaxValue = Max( wMaxValue, wValue );
2447     }
2448 
2449     min = wMinValue;
2450     max = wMaxValue;
2451 }
2452 
2453 
2454 
2455 /*****************************************************************************\
2456  Inline Function:
2457     FindWordBufferMinMaxCopy
2458 
2459 Description:
2460     Finds the min and max unsigned 16-bit values in the buffer
2461     Copies data from pBuffer to pDest at the same time
2462 
2463 Input:
2464     WORD* pDest - pointer to 16-bit buffer to copy into
2465     WORD* pBuffer - pointer to 16-bit index buffer
2466     const DWORD bytes - size of buffer in bytes
2467 
2468 Output:
2469     WORD &min - minimum 16-bit value
2470     WORD &max - maximum 16-bit value
2471 
2472 \*****************************************************************************/
FindWordBufferMinMaxCopy(WORD * pDest,WORD * pBuffer,const DWORD bytes,WORD & min,WORD & max)2473 inline void FindWordBufferMinMaxCopy(
2474     WORD* pDest,
2475     WORD* pBuffer,
2476     const DWORD bytes,
2477     WORD &min,
2478     WORD &max )
2479 {
2480 //    PrefetchBuffer( (BYTE*)pBuffer, bytes );
2481 
2482     WORD wValue = 0;
2483     WORD wMinValue = 0xffff;
2484     WORD wMaxValue = 0x0000;
2485 
2486     size_t count = bytes / sizeof(WORD);
2487     size_t i = 0;
2488 
2489     if( IsAligned( pBuffer, sizeof(WORD) ) )
2490     {
2491         const size_t DoubleQuadWordsPerPrefetch  = sizeof(PREFETCH) / sizeof(DQWORD);
2492         const size_t WordsPerPrefetch            = sizeof(PREFETCH) / sizeof(WORD);
2493         const size_t WordsPerDoubleQuadWord      = sizeof(DQWORD) / sizeof(WORD);
2494 
2495         Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
2496         Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2497 
2498         // Find min/max per cacheline of values
2499         if( count >= WordsPerDoubleQuadWord )
2500         {
2501             const size_t doubleQuadwordAlignWords =
2502                 GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(WORD);
2503 
2504             // If pBuffer is not double-quadword aligned then process
2505             // until aligned
2506             if( doubleQuadwordAlignWords )
2507             {
2508                 for( i = 0; i < doubleQuadwordAlignWords; i++ )
2509                 {
2510                     wValue = *pDest++ = *pBuffer++;
2511 
2512                     wMinValue = Min( wMinValue, wValue );
2513                     wMaxValue = Max( wMaxValue, wValue );
2514                 }
2515 
2516                 count -= doubleQuadwordAlignWords;
2517             }
2518 
2519             // Find min/max per cacheline of values
2520             if( count >= WordsPerDoubleQuadWord )
2521             {
2522                 __m128i mValue128i;
2523 
2524                 // Need to convert unsigned values to signed values
2525                 // since min/max is signed op
2526                 __m128i mSignedScale128i = _mm_set1_epi16((WORD)0x8000);
2527 
2528                 // Signed min/max initialization
2529                 __m128i mMinValue128i    = _mm_set1_epi16(wMinValue-(WORD)0x8000);
2530                 __m128i mMaxValue128i    = _mm_set1_epi16(wMaxValue-(WORD)0x8000);
2531 
2532                 while( count >= WordsPerPrefetch )
2533                 {
2534                     Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2535 
2536                     // Process cacheline values per pass
2537                     count -= WordsPerPrefetch;
2538 
2539                     for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
2540                     {
2541                         // Get double-quadword values
2542                         mValue128i = *(__m128i*)pBuffer;
2543                         _mm_storeu_si128((__m128i*)pDest, mValue128i);
2544                         pBuffer += WordsPerDoubleQuadWord;
2545                         pDest += WordsPerDoubleQuadWord;
2546 
2547                         // Make values signed
2548                         mValue128i = _mm_sub_epi16( mValue128i,
2549                             mSignedScale128i );
2550 
2551                         // Determine parallel min/max
2552                         mMinValue128i = _mm_min_epi16( mMinValue128i,
2553                             mValue128i );
2554                         mMaxValue128i = _mm_max_epi16( mMaxValue128i,
2555                             mValue128i );
2556                     }
2557                 }
2558 
2559                 // Process double-quadword values per pass for remainder
2560                 while( count >= WordsPerDoubleQuadWord )
2561                 {
2562                     // Process double-quadword values per pass
2563                     count -= WordsPerDoubleQuadWord;
2564 
2565                     // Get double-quadword values
2566                     mValue128i = *(__m128i*)pBuffer;
2567                     _mm_storeu_si128((__m128i*)pDest, mValue128i);
2568                     pBuffer += WordsPerDoubleQuadWord;
2569                     pDest += WordsPerDoubleQuadWord;
2570 
2571                     // Make values signed
2572                     mValue128i = _mm_sub_epi16( mValue128i,
2573                         mSignedScale128i );
2574 
2575                     // Determine parallel min/max
2576                     mMinValue128i = _mm_min_epi16( mMinValue128i,
2577                         mValue128i );
2578                     mMaxValue128i = _mm_max_epi16( mMaxValue128i,
2579                         mValue128i );
2580                 }
2581 
2582                 // Determine wMinValue
2583 
2584                 // Make values unsigned
2585                 mMinValue128i = _mm_add_epi16( mMinValue128i,
2586                     mSignedScale128i );
2587 
2588                 // Extract each value in double-quadword to find minimum
2589                 // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
2590                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 0 );
2591                 wMinValue = Min( wMinValue, wValue );
2592                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 1 );
2593                 wMinValue = Min( wMinValue, wValue );
2594                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 2 );
2595                 wMinValue = Min( wMinValue, wValue );
2596                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 3 );
2597                 wMinValue = Min( wMinValue, wValue );
2598                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 4 );
2599                 wMinValue = Min( wMinValue, wValue );
2600                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 5 );
2601                 wMinValue = Min( wMinValue, wValue );
2602                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 6 );
2603                 wMinValue = Min( wMinValue, wValue );
2604                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 7 );
2605                 wMinValue = Min( wMinValue, wValue );
2606 
2607                 // Determine wMaxValue
2608 
2609                 // Make values unsigned
2610                 mMaxValue128i = _mm_add_epi16( mMaxValue128i,
2611                     mSignedScale128i );
2612 
2613                 // Extract each value in double-quadword to find maximum
2614                 // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
2615                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 0 );
2616                 wMaxValue = Max( wMaxValue, wValue );
2617                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 1 );
2618                 wMaxValue = Max( wMaxValue, wValue );
2619                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 2 );
2620                 wMaxValue = Max( wMaxValue, wValue );
2621                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 3 );
2622                 wMaxValue = Max( wMaxValue, wValue );
2623                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 4 );
2624                 wMaxValue = Max( wMaxValue, wValue );
2625                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 5 );
2626                 wMaxValue = Max( wMaxValue, wValue );
2627                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 6 );
2628                 wMaxValue = Max( wMaxValue, wValue );
2629                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 7 );
2630                 wMaxValue = Max( wMaxValue, wValue );
2631 
2632             } // if( count >= WordsPerDoubleQuadWord )
2633         } // if( count >= WordsPerDoubleQuadWord )
2634     }
2635 #ifndef _WIN64
2636     else // if( IsAligned( pBuffer, sizeof(WORD) ) )
2637     {
2638         const size_t QuadWordsPerCacheline   = sizeof(CACHELINE) / sizeof(QWORD);
2639         const size_t WordsPerCacheline       = sizeof(CACHELINE) / sizeof(WORD);
2640         const size_t WordsPerQuadWord        = sizeof(QWORD) / sizeof(WORD);
2641 
2642         Prefetch( (BYTE*)pBuffer + sizeof(CACHELINE) );
2643         Prefetch( (BYTE*)pBuffer + 2 * sizeof(CACHELINE) );
2644 
2645         if( count >= WordsPerQuadWord )
2646         {
2647             __m64   mValue64;
2648 
2649             // Need to convert unsigned values to signed values
2650             // since min/max is signed op
2651             __m64   mSignedScale64  = _mm_set1_pi16((WORD)0x8000);
2652 
2653             // Signed min/max initialization
2654             __m64   mMinValue64     = _mm_set1_pi16(wMinValue-(WORD)0x8000);
2655             __m64   mMaxValue64     = _mm_set1_pi16(wMaxValue-(WORD)0x8000);
2656 
2657             // Find min/max per cacheline of values
2658             while( count >= WordsPerCacheline )
2659             {
2660                 Prefetch( (BYTE*)pBuffer + sizeof(CACHELINE) );
2661 
2662                 // Process cacheline values per pass
2663                 count -= WordsPerCacheline;
2664 
2665                 for( i = 0; i < QuadWordsPerCacheline; i++ )
2666                 {
2667                     // Get quadword values
2668                     mValue64 = *(__m64*)pBuffer;
2669                     *(__m64*)pDest = mValue64;
2670                     pBuffer += WordsPerQuadWord;
2671                     pDest += WordsPerQuadWord;
2672 
2673                     // Make values signed
2674                     mValue64 = _mm_sub_pi16( mValue64, mSignedScale64 );
2675 
2676                     // Determine parallel min/max
2677                     mMinValue64 = _mm_min_pi16( mMinValue64, mValue64 );
2678                     mMaxValue64 = _mm_max_pi16( mMaxValue64, mValue64 );
2679                 }
2680             }
2681 
2682             // Process quadword values per pass for remainder
2683             while( count >= WordsPerQuadWord )
2684             {
2685                 // Process quadword values per pass
2686                 count -= WordsPerQuadWord;
2687 
2688                 // Get quadword values
2689                 mValue64 = *(__m64*)pBuffer;
2690                 *(__m64*)pDest = mValue64;
2691                 pBuffer += WordsPerQuadWord;
2692                 pDest += WordsPerQuadWord;
2693 
2694                 // Make values signed
2695                 mValue64 = _mm_sub_pi16( mValue64, mSignedScale64 );
2696 
2697                 // Determine parallel min/max
2698                 mMinValue64 = _mm_min_pi16( mMinValue64, mValue64 );
2699                 mMaxValue64 = _mm_max_pi16( mMaxValue64, mValue64 );
2700             }
2701 
2702             // Determine wMinValue
2703 
2704             // Make values unsigned
2705             mMinValue64 = _mm_add_pi16( mMinValue64, mSignedScale64 );
2706 
2707             // Extract each value in quadword to find minimum
2708             // for( i = 0; i < WordsPerQuadWord; i++ )
2709             wValue = (WORD)_mm_extract_pi16( mMinValue64, 0 );
2710             wMinValue = Min( wMinValue, wValue );
2711             wValue = (WORD)_mm_extract_pi16( mMinValue64, 1 );
2712             wMinValue = Min( wMinValue, wValue );
2713             wValue = (WORD)_mm_extract_pi16( mMinValue64, 2 );
2714             wMinValue = Min( wMinValue, wValue );
2715             wValue = (WORD)_mm_extract_pi16( mMinValue64, 3 );
2716             wMinValue = Min( wMinValue, wValue );
2717 
2718             // Determine wMaxValue
2719 
2720             // Make values unsigned
2721             mMaxValue64 = _mm_add_pi16( mMaxValue64, mSignedScale64 );
2722 
2723             // Extract each value in quadword to find maximum
2724             // for( i = 0; i < WordsPerQuadWord; i++ )
2725             wValue = (WORD)_mm_extract_pi16( mMaxValue64, 0 );
2726             wMaxValue = Max( wMaxValue, wValue );
2727             wValue = (WORD)_mm_extract_pi16( mMaxValue64, 1 );
2728             wMaxValue = Max( wMaxValue, wValue );
2729             wValue = (WORD)_mm_extract_pi16( mMaxValue64, 2 );
2730             wMaxValue = Max( wMaxValue, wValue );
2731             wValue = (WORD)_mm_extract_pi16( mMaxValue64, 3 );
2732             wMaxValue = Max( wMaxValue, wValue );
2733 
2734             _mm_empty();
2735 
2736         } // if( count >= WordsPerQuadWord )
2737     }
2738 #endif
2739 
2740     // Find min/max per value
2741     while( count > 0 )
2742     {
2743         count -= 1;
2744 
2745         wValue = *pDest++ = *pBuffer++;
2746 
2747         wMinValue = Min( wMinValue, wValue );
2748         wMaxValue = Max( wMaxValue, wValue );
2749     }
2750 
2751     min = wMinValue;
2752     max = wMaxValue;
2753 }
2754 
2755 /*****************************************************************************\
2756 Inline Function:
2757     FindDWordBufferMinMaxCopy
2758 
2759 Description:
2760     Finds the min and max unsigned 32-bit values in the buffer
2761     Copies data from pBuffer to pDest at the same time
2762 
2763 Input:
2764     DWORD* pDest - pointer to 32-bit buffer to copy into
2765     DWORD* pBuffer - pointer to 32-bit buffer
2766     const DWORD bytes - size of buffer in bytes
2767 
2768 Output:
2769     WORD &min - minimum 32-bit value
2770     WORD &max - maximum 32-bit value
2771 
2772 \*****************************************************************************/
FindDWordBufferMinMaxCopy(DWORD * pDest,DWORD * pBuffer,const DWORD bytes,DWORD & min,DWORD & max)2773 inline void FindDWordBufferMinMaxCopy(
2774     DWORD* pDest,
2775     DWORD* pBuffer,
2776     const DWORD bytes,
2777     DWORD &min,
2778     DWORD &max )
2779 {
2780 //    PrefetchBuffer( (BYTE*)pBuffer, bytes );
2781 
2782     DWORD wValue = 0;
2783     DWORD wMinValue = 0xffffffff;
2784     DWORD wMaxValue = 0x00000000;
2785 
2786     DWORD count = bytes / sizeof(DWORD);
2787     DWORD i = 0;
2788 
2789     if( IsAligned( pBuffer, sizeof(DWORD) ) )
2790     {
2791         const DWORD DoubleQuadWordsPerPrefetch  = sizeof(PREFETCH) / sizeof(DQWORD);
2792         const DWORD DWordsPerPrefetch            = sizeof(PREFETCH) / sizeof(DWORD);
2793         const DWORD DWordsPerDoubleQuadWord      = sizeof(DQWORD) / sizeof(DWORD);
2794 
2795         Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
2796         Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2797 
2798         // Find min/max per cacheline of values
2799         if( count >= DWordsPerDoubleQuadWord )
2800         {
2801             const DWORD doubleQuadwordAlignWords =
2802                 GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(DWORD);
2803 
2804             // If pBuffer is not double-quadword aligned then process
2805             // until aligned
2806             if( doubleQuadwordAlignWords )
2807             {
2808                 for( i = 0; i < doubleQuadwordAlignWords; i++ )
2809                 {
2810                     wValue = *pDest++ = *pBuffer++;
2811 
2812                     wMinValue = Min( wMinValue, wValue );
2813                     wMaxValue = Max( wMaxValue, wValue );
2814                 }
2815 
2816                 count -= doubleQuadwordAlignWords;
2817             }
2818 
2819             // Find min/max per cacheline of values
2820             if( count >= DWordsPerDoubleQuadWord )
2821             {
2822                 __m128i mValue128i;
2823                 __m128 mValue128;
2824 
2825                 // Signed min/max initialization
2826                 // need extra QWORD bits for SSE2 FP conversion
2827                 __m128  mMinValue128 = _mm_set1_ps( (float)( (QWORD)wMinValue  ) );
2828                 __m128  mMaxValue128 = _mm_set1_ps( (float)( wMaxValue  ) );
2829 
2830                 while( count >= DWordsPerPrefetch )
2831                 {
2832                     Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2833 
2834                     // Process cacheline values per pass
2835                     count -= DWordsPerPrefetch;
2836 
2837                     for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
2838                     {
2839                         // Get double-quadword values
2840                         mValue128i = *(__m128i*)pBuffer;
2841                         _mm_storeu_si128((__m128i*)pDest, mValue128i);
2842                         pBuffer += DWordsPerDoubleQuadWord;
2843                         pDest += DWordsPerDoubleQuadWord;
2844 
2845                         // Convert to FP
2846                         mValue128 = _mm_cvtepi32_ps( mValue128i );
2847 
2848                         // Determine parallel min/max
2849                         mMinValue128 = _mm_min_ps( mMinValue128,
2850                             mValue128 );
2851                         mMaxValue128 = _mm_max_ps( mMaxValue128,
2852                             mValue128 );
2853                     }
2854                 }
2855 
2856                 // Process double-quadword values per pass for remainder
2857                 while( count >= DWordsPerDoubleQuadWord )
2858                 {
2859                     // Process double-quadword values per pass
2860                     count -= DWordsPerDoubleQuadWord;
2861 
2862                     // Get double-quadword values
2863                     mValue128i = *(__m128i*)pBuffer;
2864                     _mm_storeu_si128((__m128i*)pDest, mValue128i);
2865                     pBuffer += DWordsPerDoubleQuadWord;
2866                     pDest += DWordsPerDoubleQuadWord;
2867 
2868                     // Convert to FP
2869                     mValue128 = _mm_cvtepi32_ps( mValue128i );
2870 
2871                     // Determine parallel min/max
2872                     mMinValue128 = _mm_min_ps( mMinValue128,
2873                         mValue128 );
2874                     mMaxValue128 = _mm_max_ps( mMaxValue128,
2875                         mValue128 );
2876                 }
2877 
2878                 // Determine wMinValue
2879 
2880                 // Convert back to DWORD
2881                 __m128i mMinValue128i = _mm_cvtps_epi32( mMinValue128 );
2882 
2883                 // Extract each value in double-quadword to find minimum
2884                 // Grab element 0 from m128i reg:   3 | 2 | 1 | 0
2885                 wValue = (DWORD)_mm_cvtsi128_si32( mMinValue128i );
2886                 wMinValue = Min( wMinValue, wValue );
2887                 // Grab element 1 from m128i reg:   3 | 2 | 1 | 0
2888                 wValue = (DWORD)_mm_cvtsi128_si32(
2889                         _mm_srli_si128( mMinValue128i, 4 ) );
2890                 wMinValue = Min( wMinValue, wValue );
2891                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
2892                 wValue = (DWORD)_mm_cvtsi128_si32(
2893                         _mm_srli_si128( mMinValue128i, 8 ) );
2894                 wMinValue = Min( wMinValue, wValue );
2895                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
2896                 wValue = (DWORD)_mm_cvtsi128_si32(
2897                         _mm_srli_si128( mMinValue128i, 12 ) );
2898                 wMinValue = Min( wMinValue, wValue );
2899 
2900                 // Determine wMaxValue
2901 
2902                 // Convert back to DWORD
2903                 __m128i mMaxValue128i = _mm_cvtps_epi32( mMaxValue128 );
2904 
2905                 // Extract each value in double-quadword to find maximum
2906                 // Grab element 0 from m128i reg:   3 | 2 | 1 | 0
2907                 wValue = (DWORD)_mm_cvtsi128_si32( mMaxValue128i );
2908                 wMaxValue = Max( wMaxValue, wValue );
2909                 // Grab element 1 from m128i reg:   3 | 2 | 1 | 0
2910                 wValue = (DWORD)_mm_cvtsi128_si32(
2911                         _mm_srli_si128( mMaxValue128i, 4 ) );
2912                 wMaxValue = Max( wMaxValue, wValue );
2913                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
2914                 wValue = (DWORD)_mm_cvtsi128_si32(
2915                         _mm_srli_si128( mMaxValue128i, 8 ) );
2916                 wMaxValue = Max( wMaxValue, wValue );
2917                 // Grab element 3 from m128i reg:   3 | 2 | 1 | 0
2918                 wValue = (DWORD)_mm_cvtsi128_si32(
2919                         _mm_srli_si128( mMaxValue128i, 12 ) );
2920                 wMaxValue = Max( wMaxValue, wValue );
2921 
2922             } // if( count >= DWordsPerDoubleQuadWord )
2923         } // if( count >= DWordsPerDoubleQuadWord )
2924     }
2925 
2926     // Find min/max per value
2927     while( count > 0 )
2928     {
2929         count -= 1;
2930 
2931         wValue = *pDest++ = *pBuffer++;
2932 
2933         wMinValue = Min( wMinValue, wValue );
2934         wMaxValue = Max( wMaxValue, wValue );
2935     }
2936 
2937     min = wMinValue;
2938     max = wMaxValue;
2939 }
2940 
2941 
2942 /*****************************************************************************\
2943 Inline Function:
2944     FindWordBufferMinMaxRestartCopy
2945 
2946 Description:
2947     Finds the min and max unsigned 16-bit values in the buffer
2948     Excludes a restart value from min or max values
2949     Copies data from pBuffer to pDest at the same time
2950 
2951 Input:
2952     WORD* pDest - pointer to 16-bit buffer to copy into
2953     WORD* pBuffer - pointer to 16-bit buffer
2954     const DWORD bytes - size of buffer in bytes
2955     const WORD restart - restart index to ignore
2956     cpuInstructionLevel - indicates if SSE_4.1 is available
2957 
2958 Output:
2959     WORD &min - minimum 16-bit value
2960     WORD &max - maximum 16-bit value
2961 
2962 \*****************************************************************************/
FindWordBufferMinMaxRestartCopy(WORD * pDest,WORD * pBuffer,const DWORD bytes,const WORD restart,WORD & min,WORD & max,CPU_INSTRUCTION_LEVEL cpuInstructionLevel)2963 inline void FindWordBufferMinMaxRestartCopy(
2964     WORD* pDest,
2965     WORD* pBuffer,
2966     const DWORD bytes,
2967     const WORD restart,
2968     WORD &min,
2969     WORD &max,
2970     CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
2971 {
2972 //    PrefetchBuffer( (BYTE*)pBuffer, bytes );
2973 
2974     WORD wValue = 0;
2975     WORD wMinValue = 0xffff;
2976     WORD wMaxValue = 0x0000;
2977 
2978     size_t count = bytes / sizeof(WORD);
2979 
2980 #ifdef USE_SSE4_1
2981 
2982     size_t i = 0;
2983 
2984     if( IsAligned( pBuffer, sizeof(WORD) ) &&
2985         cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
2986     {
2987         const DWORD DoubleQuadWordsPerPrefetch  = sizeof(PREFETCH) / sizeof(DQWORD);
2988         const DWORD WordsPerPrefetch            = sizeof(PREFETCH) / sizeof(WORD);
2989         const DWORD WordsPerDoubleQuadWord      = sizeof(DQWORD) / sizeof(WORD);
2990 
2991         Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
2992         Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2993 
2994         // Find min/max per cacheline of values
2995         if( count >= WordsPerDoubleQuadWord )
2996         {
2997             const size_t doubleQuadwordAlignWords =
2998                 GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(WORD);
2999 
3000             // If pBuffer is not double-quadword aligned then process
3001             // until aligned
3002             if( doubleQuadwordAlignWords )
3003             {
3004                 for( i = 0; i < doubleQuadwordAlignWords; i++ )
3005                 {
3006                     wValue = *pDest++ = *pBuffer++;
3007 
3008                     if (wValue == restart) {
3009                         continue;
3010                     }
3011                     wMinValue = Min( wMinValue, wValue );
3012                     wMaxValue = Max( wMaxValue, wValue );
3013                 }
3014 
3015                 count -= doubleQuadwordAlignWords;
3016             }
3017 
3018             // Find min/max per cacheline of values
3019             if( count >= WordsPerDoubleQuadWord )
3020             {
3021                 __m128i mInput, mRestarts, mMask;
3022                 __m128i mAll_ones;
3023                 __m128i mMinValue128i, mMaxValue128i;
3024 
3025                 // This is just used for andnot mInput
3026                 mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
3027                 mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
3028 
3029                 // start with really high min and really low max
3030                 // What should happen if all values are restart?
3031                 mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
3032                 mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
3033                 mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
3034                 mMaxValue128i.m128i_u64[1] = 0x0000000000000000;
3035 
3036                 // Initialize register used for testing for restart index.
3037                 mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] =
3038                     (((UINT64) restart) << 48) |
3039                     (((UINT64) restart) << 32) |
3040                     (((UINT64) restart) << 16) |
3041                     ((UINT64) restart);
3042 
3043                 while( count >= WordsPerPrefetch )
3044                 {
3045                     Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
3046 
3047                     // Process cacheline values per pass
3048                     count -= WordsPerPrefetch;
3049 
3050                     for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
3051                     {
3052                         // Get double-quadword values
3053                         mInput = *(__m128i*)pBuffer;
3054                         _mm_storeu_si128((__m128i*)pDest, mInput);
3055                         pBuffer += WordsPerDoubleQuadWord;
3056                         pDest += WordsPerDoubleQuadWord;
3057 
3058                         // Make mask of non-restart_index fields
3059                         mMask = _mm_andnot_si128(_mm_cmpeq_epi16(mInput, mRestarts), mAll_ones);
3060 
3061                         // Copy minimum and maximum fields for non-restarts
3062                         mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu16(mMinValue128i, mInput), mMask );
3063                         mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu16(mMaxValue128i, mInput), mMask );
3064                     }
3065                 }
3066 
3067                 // Process double-quadword values per pass for remainder
3068                 while( count >= WordsPerDoubleQuadWord )
3069                 {
3070                     // Process double-quadword values per pass
3071                     count -= WordsPerDoubleQuadWord;
3072 
3073                     // Get double-quadword values
3074                     mInput = *(__m128i*)pBuffer;
3075                     _mm_storeu_si128((__m128i*)pDest, mInput);
3076                     pBuffer += WordsPerDoubleQuadWord;
3077                     pDest += WordsPerDoubleQuadWord;
3078 
3079                     // Make mask of non-restart_index fields
3080                     mMask = _mm_andnot_si128(_mm_cmpeq_epi16(mInput, mRestarts), mAll_ones);
3081 
3082                     // Copy minimum and maximum fields for non-restarts
3083                     mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu16(mMinValue128i, mInput), mMask );
3084                     mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu16(mMaxValue128i, mInput), mMask );
3085                 }
3086 
3087                 // Determine wMinValue
3088 
3089                 // Extract each value in double-quadword to find minimum
3090                 // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
3091                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 0 );
3092                 wMinValue = Min( wMinValue, wValue );
3093                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 1 );
3094                 wMinValue = Min( wMinValue, wValue );
3095                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 2 );
3096                 wMinValue = Min( wMinValue, wValue );
3097                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 3 );
3098                 wMinValue = Min( wMinValue, wValue );
3099                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 4 );
3100                 wMinValue = Min( wMinValue, wValue );
3101                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 5 );
3102                 wMinValue = Min( wMinValue, wValue );
3103                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 6 );
3104                 wMinValue = Min( wMinValue, wValue );
3105                 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 7 );
3106                 wMinValue = Min( wMinValue, wValue );
3107 
3108                 // Determine wMaxValue
3109 
3110                 // Extract each value in double-quadword to find maximum
3111                 // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
3112                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 0 );
3113                 wMaxValue = Max( wMaxValue, wValue );
3114                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 1 );
3115                 wMaxValue = Max( wMaxValue, wValue );
3116                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 2 );
3117                 wMaxValue = Max( wMaxValue, wValue );
3118                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 3 );
3119                 wMaxValue = Max( wMaxValue, wValue );
3120                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 4 );
3121                 wMaxValue = Max( wMaxValue, wValue );
3122                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 5 );
3123                 wMaxValue = Max( wMaxValue, wValue );
3124                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 6 );
3125                 wMaxValue = Max( wMaxValue, wValue );
3126                 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 7 );
3127                 wMaxValue = Max( wMaxValue, wValue );
3128 
3129             } // if( count >= WordsPerDoubleQuadWord )
3130         } // if( count >= WordsPerDoubleQuadWord )
3131     }
3132 
3133 #endif // USE_SSE4_1
3134 
3135     // Find min/max per value
3136     while( count > 0 )
3137     {
3138         count -= 1;
3139 
3140         wValue = *pDest++ = *pBuffer++;
3141 
3142         if (wValue == restart) {
3143             continue;
3144         }
3145         wMinValue = Min( wMinValue, wValue );
3146         wMaxValue = Max( wMaxValue, wValue );
3147     }
3148 
3149     min = wMinValue;
3150     max = wMaxValue;
3151 }
3152 
3153 
3154 /*****************************************************************************\
3155 Inline Function:
3156     FindDWordBufferMinMaxRestartCopy
3157 
3158 Description:
3159     Finds the min and max unsigned 32-bit values in the buffer
3160     Excludes a restart value from min or max values
3161     Copies data from pBuffer to pDest at the same time
3162 
3163 Input:
3164     DWORD* pDest - pointer to 32-bit buffer to copy into
3165     DWORD* pBuffer - pointer to 32-bit index buffer
3166     const DWORD bytes - size of buffer in bytes
3167     const DWORD restart - restart index to ignore
3168     cpuInstructionLevel - indicates if SSE_4.1 is available
3169 
3170 Output:
3171     DWORD &min - minimum 32-bit value
3172     DWORD &max - maximum 32-bit value
3173 
3174 \*****************************************************************************/
FindDWordBufferMinMaxRestartCopy(DWORD * pDest,DWORD * pBuffer,const DWORD bytes,const DWORD restart,DWORD & min,DWORD & max,CPU_INSTRUCTION_LEVEL cpuInstructionLevel)3175 inline void FindDWordBufferMinMaxRestartCopy(
3176     DWORD* pDest,
3177     DWORD* pBuffer,
3178     const DWORD bytes,
3179     const DWORD restart,
3180     DWORD &min,
3181     DWORD &max,
3182     CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
3183 {
3184 //    PrefetchBuffer( (BYTE*)pBuffer, bytes );
3185 
3186     DWORD wValue = 0;
3187     DWORD wMinValue = 0xffffffff;
3188     DWORD wMaxValue = 0x00000000;
3189 
3190     DWORD count = bytes / sizeof(DWORD);
3191 
3192 #ifdef USE_SSE4_1
3193 
3194     DWORD i = 0;
3195 
3196     if( IsAligned( pBuffer, sizeof(DWORD) ) &&
3197         cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
3198     {
3199         const DWORD DoubleQuadWordsPerPrefetch  = sizeof(PREFETCH) / sizeof(DQWORD);
3200         const DWORD DWordsPerPrefetch            = sizeof(PREFETCH) / sizeof(DWORD);
3201         const DWORD DWordsPerDoubleQuadWord      = sizeof(DQWORD) / sizeof(DWORD);
3202 
3203         Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
3204         Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
3205 
3206         // Find min/max per cacheline of values
3207         if( count >= DWordsPerDoubleQuadWord )
3208         {
3209             const DWORD doubleQuadwordAlignWords =
3210                 GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(DWORD);
3211 
3212             // If pBuffer is not double-quadword aligned then process
3213             // until aligned
3214             if( doubleQuadwordAlignWords )
3215             {
3216                 for( i = 0; i < doubleQuadwordAlignWords; i++ )
3217                 {
3218                     wValue = *pDest++ = *pBuffer++;
3219 
3220                     if (wValue == restart) {
3221                         continue;
3222                     }
3223                     wMinValue = Min( wMinValue, wValue );
3224                     wMaxValue = Max( wMaxValue, wValue );
3225                 }
3226 
3227                 count -= doubleQuadwordAlignWords;
3228             }
3229 
3230             // Find min/max per cacheline of values
3231             if( count >= DWordsPerPrefetch )
3232             {
3233                 __m128i mInput, mRestarts, mMask;
3234                 __m128i mAll_ones;
3235                 __m128i mMinValue128i, mMaxValue128i;
3236 
3237                 // This is just used for andnot mInput
3238                 mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
3239                 mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
3240 
3241                 // start with really high min and really low max
3242                 // What should happen if all values are restart?
3243                 mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
3244                 mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
3245                 mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
3246                 mMaxValue128i.m128i_u64[1] = 0x0000000000000000;
3247 
3248                 // Initialize register used for testing for restart index.
3249                 mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] = (((UINT64) restart) << 32) | ((UINT64) restart);
3250 
3251                 while( count >= DWordsPerPrefetch )
3252                 {
3253                     Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
3254 
3255                     // Process cacheline values per pass
3256                     count -= DWordsPerPrefetch;
3257 
3258                     for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
3259                     {
3260                         // Get double-quadword values
3261                         mInput = *(__m128i*)pBuffer;
3262                         _mm_storeu_si128((__m128i*)pDest, mInput);
3263                         pBuffer += DWordsPerDoubleQuadWord;
3264                         pDest += DWordsPerDoubleQuadWord;
3265 
3266                         // Make mask of non-restart_index fields
3267                         mMask = _mm_andnot_si128(_mm_cmpeq_epi32(mInput, mRestarts), mAll_ones);
3268 
3269                         // Copy minimum and maximum fields for non-restarts
3270                         mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu32(mMinValue128i, mInput), mMask );
3271                         mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu32(mMaxValue128i, mInput), mMask );
3272                     }
3273                 }
3274 
3275                 // Process double-quadword values per pass for remainder
3276                 while( count >= DWordsPerDoubleQuadWord )
3277                 {
3278                     // Process double-quadword values per pass
3279                     count -= DWordsPerDoubleQuadWord;
3280 
3281                     // Get double-quadword values
3282                     mInput = *(__m128i*)pBuffer;
3283                     _mm_storeu_si128((__m128i*)pDest, mInput);
3284                     pBuffer += DWordsPerDoubleQuadWord;
3285                     pDest += DWordsPerDoubleQuadWord;
3286 
3287                     // Make mask of non-restart_index fields
3288                     mMask = _mm_andnot_si128(_mm_cmpeq_epi32(mInput, mRestarts), mAll_ones);
3289 
3290                     // Copy minimum and maximum fields for non-restarts
3291                     mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu32(mMinValue128i, mInput), mMask );
3292                     mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu32(mMaxValue128i, mInput), mMask );
3293                 }
3294 
3295                 // Determine wMinValue
3296 
3297                 // Extract each value in double-quadword to find minimum
3298                 // Grab element 0 from m128i reg:   3 | 2 | 1 | 0
3299                 wValue = (DWORD)_mm_cvtsi128_si32( mMinValue128i );
3300                 wMinValue = Min( wMinValue, wValue );
3301                 // Grab element 1 from m128i reg:   3 | 2 | 1 | 0
3302                 wValue = (DWORD)_mm_cvtsi128_si32(
3303                         _mm_srli_si128( mMinValue128i, 4 ) );
3304                 wMinValue = Min( wMinValue, wValue );
3305                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
3306                 wValue = (DWORD)_mm_cvtsi128_si32(
3307                         _mm_srli_si128( mMinValue128i, 8 ) );
3308                 wMinValue = Min( wMinValue, wValue );
3309                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
3310                 wValue = (DWORD)_mm_cvtsi128_si32(
3311                         _mm_srli_si128( mMinValue128i, 12 ) );
3312                 wMinValue = Min( wMinValue, wValue );
3313 
3314                 // Determine wMaxValue
3315 
3316                 // Extract each value in double-quadword to find maximum
3317                 // Grab element 0 from m128i reg:   3 | 2 | 1 | 0
3318                 wValue = (DWORD)_mm_cvtsi128_si32( mMaxValue128i );
3319                 wMaxValue = Max( wMaxValue, wValue );
3320                 // Grab element 1 from m128i reg:   3 | 2 | 1 | 0
3321                 wValue = (DWORD)_mm_cvtsi128_si32(
3322                         _mm_srli_si128( mMaxValue128i, 4 ) );
3323                 wMaxValue = Max( wMaxValue, wValue );
3324                 // Grab element 2 from m128i reg:   3 | 2 | 1 | 0
3325                 wValue = (DWORD)_mm_cvtsi128_si32(
3326                         _mm_srli_si128( mMaxValue128i, 8 ) );
3327                 wMaxValue = Max( wMaxValue, wValue );
3328                 // Grab element 3 from m128i reg:   3 | 2 | 1 | 0
3329                 wValue = (DWORD)_mm_cvtsi128_si32(
3330                         _mm_srli_si128( mMaxValue128i, 12 ) );
3331                 wMaxValue = Max( wMaxValue, wValue );
3332 
3333             } // if( count >= DWordsPerPrefetch )
3334         } // if( count >= DWordsPerDoubleQuadWord )
3335     }
3336 
3337 #endif // USE_SSE4_1
3338 
3339     // Find min/max per value
3340     while( count > 0 )
3341     {
3342         count -= 1;
3343 
3344         wValue = *pDest++ = *pBuffer++;
3345 
3346         if (wValue == restart) {
3347             continue;
3348         }
3349         wMinValue = Min( wMinValue, wValue );
3350         wMaxValue = Max( wMaxValue, wValue );
3351     }
3352 
3353     min = wMinValue;
3354     max = wMaxValue;
3355 }
3356 
3357 
3358 } // iSTD
3359