1 /*========================== begin_copyright_notice ============================
2
3 Copyright (C) 2017-2021 Intel Corporation
4
5 SPDX-License-Identifier: MIT
6
7 ============================= end_copyright_notice ===========================*/
8
9 #pragma once
10
11 #include "types.h"
12 #include "Debug.h"
13 #include "utility.h"
14 #include <string.h>
15 #include "CpuUtil.h"
16
17 #if !defined ( _MSC_VER )
18 #include "inc/common/secure_mem.h"
19 #endif
20
21 #if defined(_WIN32)
22 #include <basetsd.h>
23 #if defined ( _WIN64 ) && defined ( _In_ )
24 // NOTE: <math.h> is not necessary here.
25 // This is only an ugly workaround for a VS2008 bug that causes the compilation
26 // issue on 64-bit DEBUG configuration.
27 // Including "math.h" before "intrin.h" helps to get rid of the following warning:
28 // warning C4985: 'ceil': attributes not present on previous declaration.
29 #include <math.h>
30 #endif
31 #include <intrin.h>
32 #define USE_SSE4_1
33 #else
34 #include <x86intrin.h>
35 #endif
36
typedef __m128     DQWORD;         // 128-bits,  16-bytes
typedef DWORD   PREFETCH[8];       //  32-bytes (granularity used by PrefetchBuffer)
typedef DWORD   CACHELINE[8];      //  32-bytes
typedef WORD    DHWORD[32];        // 512-bits,  64-bytes
41
42 namespace iSTD
43 {
44
// Shift amounts, masks and size thresholds shared by the copy routines below.
enum
{
    DWORD_SHIFT       = 2,      // log2(sizeof(DWORD)) - bytes -> DWORD count
    BYTE_TAIL         = 3,      // mask: byte remainder of a DWORD-granular copy
    INSTR_128_SHIFT   = 4,      // log2(16) - bytes -> 128-bit register count
    CACHE_LINE_SHIFT  = 6,      // log2(64)
    DUAL_CACHE_SHIFT  = 7,      // log2(128) - bytes -> two-cache-line blocks
    TAIL_SIZE         = 15,     // mask: remainder below one 128-bit register
    INSTR_WIDTH_128   = 16,     // bytes in a 128-bit SSE register
    INSTR_WIDTH_256   = 32,     // bytes in a 256-bit register
    CACHE_LINE_SIZE   = 64,     // bytes per cache line
    TIERED_TAIL       = 127,    // mask: remainder below a two-cache-line block
    DUAL_CACHE_SIZE   = 128,    // two cache lines - the bulk copy unit
    MIN_ERMSB_ALIGNED = 4096,   // presumably the ERMSB rep-movsb threshold - not used in this chunk
    MIN_STREAM_SIZE   = 524288, // above this, use non-temporal streaming stores
};
61
// Inline assembly is only enabled for 32-bit MSVC builds; all other
// configurations (x64 MSVC, non-MSVC compilers) use intrinsics instead.
#ifdef _WIN64
#   define USE_INLINE_ASM 0
#else
#   if defined _MSC_VER
#       define USE_INLINE_ASM 1
#   else
#       define USE_INLINE_ASM 0
#   endif
#endif
71
/*****************************************************************************\
Function Prototypes
\*****************************************************************************/
// Cache-control helpers
inline void Prefetch( const void* );
inline void PrefetchBuffer( const void*, const size_t );
inline void CachelineFlush( const void* );

// Memory copy - compile-time sized, runtime sized, write-combined destination,
// and byte-order-swapping variants
template <size_t size>
inline void MemCopy( void*, const void* );
inline void MemCopy( void*, const void*, const size_t );
inline void MemCopyWC( void*, const void*, const size_t );
inline void MemCopySwapBytes( void*, const void*, const size_t, const unsigned int);
inline void ScalarSwapBytes( __m128i**, const __m128i**, const size_t, const unsigned int);

// Safe (bounds-explicit) set/compare/move
inline void SafeMemSet( void*, const int, const size_t );
inline int  SafeMemCompare( const void*, const void*, const size_t );
inline void SafeMemMove( void*, const void*, const size_t );

// USWC (write-combined source) block copies - 32-bit builds only
#ifndef _WIN64
inline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa(void* dst, const void* src );
inline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu(void* dst, const void* src );
#endif
inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, CPU_INSTRUCTION_LEVEL cpuInstructionLevel);

inline void FastCpuBlt( BYTE*, const DWORD, BYTE*, const DWORD, const DWORD, DWORD );

// Min/max scans over WORD/DWORD buffers (with optional restart-index skip
// and copy-while-scanning variants)
inline void FindWordBufferMinMax( WORD*, const DWORD, WORD&, WORD& );
inline void FindDWordBufferMinMax( DWORD*, const DWORD, DWORD&, DWORD& );
inline void FindWordBufferMinMaxRestart( WORD*, const DWORD, const WORD, WORD&, WORD&, CPU_INSTRUCTION_LEVEL cpuInstructionLevel );
inline void FindDWordBufferMinMaxRestart( DWORD*, const DWORD, const DWORD, DWORD&, DWORD&, CPU_INSTRUCTION_LEVEL cpuInstructionLevel );

inline void FindWordBufferMinMaxCopy( WORD*, WORD*, const DWORD, WORD&, WORD& );
inline void FindDWordBufferMinMaxCopy( DWORD*, DWORD*, const DWORD, DWORD&, DWORD& );
inline void FindWordBufferMinMaxRestartCopy( WORD*, WORD*, const DWORD, const WORD, WORD&, WORD&, CPU_INSTRUCTION_LEVEL cpuInstructionLevel );
inline void FindDWordBufferMinMaxRestartCopy( DWORD*, DWORD*, const DWORD, const DWORD, DWORD&, DWORD&, CPU_INSTRUCTION_LEVEL cpuInstructionLevel );
107
108 /*****************************************************************************\
109 Inline Function:
110 Prefetch
111
112 Description:
113 executes __asm prefetchnta
114 \*****************************************************************************/
Prefetch(const void * ptr)115 inline void Prefetch( const void* ptr )
116 {
117 _mm_prefetch( (const char*)ptr, _MM_HINT_NTA );
118 }
119
120 /*****************************************************************************\
121 Inline Function:
122 PrefetchBuffer
123
124 Description:
125 executes __asm prefetchnta
126 \*****************************************************************************/
PrefetchBuffer(const void * pBuffer,const size_t bytes)127 inline void PrefetchBuffer( const void* pBuffer, const size_t bytes )
128 {
129 const size_t cachelines = bytes / sizeof(PREFETCH);
130
131 for( size_t i = 0; i <= cachelines; i++ )
132 {
133 _mm_prefetch( (const char*)pBuffer + i * sizeof(PREFETCH),
134 _MM_HINT_NTA );
135 }
136 }
137
138 /*****************************************************************************\
139 Inline Function:
140 CachelineFlush
141
142 Description:
143 executes __asm clflush
144 \*****************************************************************************/
inline void CachelineFlush( const void* ptr )
{
    // Evict the cache line containing ptr from the entire cache hierarchy.
    _mm_clflush( const_cast<char*>( reinterpret_cast<const char*>( ptr ) ) );
}
149
150 /*****************************************************************************\
151 Inline Function:
152 MemCopy
153
154 Description:
155 Templated Exception Handler Memory Copy function
156 \*****************************************************************************/
157 template <size_t size>
MemCopy(void * dst,const void * src)158 inline void MemCopy( void* dst, const void* src )
159 {
160 MemCopy(dst, src, size);
161 }
162
163 template <>
164 inline void MemCopy<1>( void* dst, const void* src )
165 {
166 const BYTE* pSrc = reinterpret_cast<const BYTE*>(src);
167 BYTE* pDst = reinterpret_cast<BYTE*>(dst);
168 *pDst = *pSrc;
169 }
170
171 template <>
172 inline void MemCopy<2>( void* dst, const void* src )
173 {
174 const WORD* pSrc = reinterpret_cast<const WORD*>(src);
175 WORD* pDst = reinterpret_cast<WORD*>(dst);
176 *pDst = *pSrc;
177 }
178
179 template <>
180 inline void MemCopy<4>( void* dst, const void* src )
181 {
182 const UINT32* pSrc = reinterpret_cast<const UINT32*>(src);
183 UINT32* pDst = reinterpret_cast<UINT32*>(dst);
184 *pDst = *pSrc;
185 }
186
187 template <>
188 inline void MemCopy<8>( void* dst, const void* src )
189 {
190 const UINT64* pSrc = reinterpret_cast<const UINT64*>(src);
191 UINT64* pDst = reinterpret_cast<UINT64*>(dst);
192 *pDst = *pSrc;
193 }
194
195 template <>
196 inline void MemCopy<16>( void* dst, const void* src )
197 {
198 const __m128i* pMMSrc = reinterpret_cast<const __m128i*>(src);
199 __m128i* pMMDst = reinterpret_cast<__m128i*>(dst);
200 __m128i xmm0 = _mm_loadu_si128(pMMSrc);
201 _mm_storeu_si128(pMMDst, xmm0);
202 }
203
204 template <>
205 inline void MemCopy<28>( void* dst, const void* src )
206 {
207 const __m128i* pMMSrc = reinterpret_cast<const __m128i*>( src );
208 __m128i* pMMDst = reinterpret_cast<__m128i*>( dst );
209 __m128i xmm0 = _mm_loadu_si128( pMMSrc );
210 _mm_storeu_si128( pMMDst, xmm0 );
211
212 pMMSrc += 1;
213 pMMDst += 1;
214
215 const UINT64* pSrc64 = reinterpret_cast<const UINT64*>( pMMSrc );
216 UINT64* pDst64 = reinterpret_cast<UINT64*>( pMMDst );
217 *pDst64 = *pSrc64;
218
219 pDst64 += 1;
220 pSrc64 += 1;
221
222 const UINT32* pSrc32 = reinterpret_cast<const UINT32*>( pSrc64 );
223 UINT32* pDst32 = reinterpret_cast<UINT32*>( pDst64 );
224 *pDst32 = *pSrc32;
225 }
226
227 /*****************************************************************************\
228 Inline Function:
229 MemCopy
230
231 Description:
232 Exception Handler Memory Copy function
233 \*****************************************************************************/
inline void MemCopy( void* dst, const void* src, const size_t bytes )
{
    // Copies 'bytes' bytes from 'src' to 'dst'. This is a forward copy -
    // overlapping regions are not handled; use SafeMemMove for overlap.
    // MSVC path: scalar moves for small sizes, SSE2 16-byte moves once the
    // destination is 16-byte aligned, and non-temporal streaming stores for
    // copies larger than MIN_STREAM_SIZE so huge copies do not flush the
    // cache. Non-MSVC builds forward to memcpy_s (from secure_mem.h).
#if defined ( _MSC_VER )
    UINT8* pDst8 = reinterpret_cast<UINT8*>( dst );
    const UINT8* pSrc8 = reinterpret_cast<const UINT8*>( src );
    size_t bytesRemaining = bytes;

    // handle invalid cases
    if( bytesRemaining == 0 )
        return;

    // handle sizes <= 4 bytes
    if( bytesRemaining <= 4 )
    {
        if( bytesRemaining == 1 )
        {
            // copy 1 bytes
            *pDst8 = *pSrc8;
            return;
        }

        if( bytesRemaining == 2 )
        {
            // copy 2 bytes
            *reinterpret_cast<UINT16*>( pDst8 ) = *reinterpret_cast<const UINT16*>( pSrc8 );
            return;
        }

        if( bytesRemaining == 3 )
        {
            // copy 3 bytes
            *reinterpret_cast<UINT16*>( pDst8 ) = *reinterpret_cast<const UINT16*>( pSrc8 );
            *( pDst8 + 2 ) = *( pSrc8 + 2 );
            return;
        }

        // copy exactly 4 bytes
        *reinterpret_cast<UINT32*>( pDst8 ) = *reinterpret_cast<const UINT32*>( pSrc8 );
        return;
    }

    // align destination to 4 byte boundary if size is > 8 bytes
    if( bytesRemaining > 8 &&
        reinterpret_cast<UINT_PTR>( pDst8 ) & 0x3 )
    {
        // check for shift by 1
        if( reinterpret_cast<UINT_PTR>( pDst8 ) & 0x1 )
        {
            *pDst8 = *pSrc8;

            bytesRemaining -= 1;
            pDst8 += 1;
            pSrc8 += 1;
        }

        // check for shift by 2
        if( reinterpret_cast<UINT_PTR>( pDst8 ) & 0x2 )
        {
            *reinterpret_cast<UINT16*>( pDst8 ) = *reinterpret_cast<const UINT16*>( pSrc8 );

            bytesRemaining -= 2;
            pDst8 += 2;
            pSrc8 += 2;
        }
    }

    // handle sizes <= 64 bytes as series of 4 byte moves
    if( bytesRemaining <= CACHE_LINE_SIZE )
    {
        const size_t ptrAdvance = bytesRemaining & ~0x3; // TODO: Need to see if we can mimic the jump table

        pDst8 += ptrAdvance;
        pSrc8 += ptrAdvance;

        // intentional fall-through: entering at case N copies N DWORDs,
        // working backwards from the (already advanced) end pointers
        switch( bytesRemaining / 4 )
        {
        case 16:
            *reinterpret_cast<UINT32*>( pDst8 - 64 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 64 );
        case 15:
            *reinterpret_cast<UINT32*>( pDst8 - 60 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 60 );
        case 14:
            *reinterpret_cast<UINT32*>( pDst8 - 56 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 56 );
        case 13:
            *reinterpret_cast<UINT32*>( pDst8 - 52 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 52 );
        case 12:
            *reinterpret_cast<UINT32*>( pDst8 - 48 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 48 );
        case 11:
            *reinterpret_cast<UINT32*>( pDst8 - 44 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 44 );
        case 10:
            *reinterpret_cast<UINT32*>( pDst8 - 40 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 40 );
        case 9:
            *reinterpret_cast<UINT32*>( pDst8 - 36 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 36 );
        case 8:
            *reinterpret_cast<UINT32*>( pDst8 - 32 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 32 );
        case 7:
            *reinterpret_cast<UINT32*>( pDst8 - 28 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 28 );
        case 6:
            *reinterpret_cast<UINT32*>( pDst8 - 24 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 24 );
        case 5:
            *reinterpret_cast<UINT32*>( pDst8 - 20 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 20 );
        case 4:
            *reinterpret_cast<UINT32*>( pDst8 - 16 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 16 );
        case 3:
            *reinterpret_cast<UINT32*>( pDst8 - 12 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 12 );
        case 2:
            *reinterpret_cast<UINT32*>( pDst8 - 8 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 8 );
        case 1:
            *reinterpret_cast<UINT32*>( pDst8 - 4 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 4 );
        }

        // tail may have up to 3 bytes off
        // (the low two bits of bytesRemaining were untouched by ptrAdvance)
        if( bytesRemaining & 0x1 )
        {
            *pDst8 = *pSrc8;

            bytesRemaining -= 1;
            pDst8 += 1;
            pSrc8 += 1;
        }

        if( bytesRemaining & 0x2 )
        {
            *reinterpret_cast<UINT16*>( pDst8 ) = *reinterpret_cast<const UINT16*>( pSrc8 );

            bytesRemaining -= 2;
            pDst8 += 2;
            pSrc8 += 2;
        }
    }

    // size is > 64 bytes use SSE2
    else
    {
        __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7; // xmm registers

        // align the destination to 16 bytes if necessary
        const size_t alignDst16 = reinterpret_cast<UINT_PTR>( pDst8 ) & TAIL_SIZE;
        if( alignDst16 != 0 )
        {
            const size_t alignSize = 0x10 - alignDst16;

            // already aligned to 4 bytes previously, so remainder must be a multiple of 4
            pDst8 += alignSize;
            pSrc8 += alignSize;

            // intentional fall-through: copies alignSize/4 DWORDs
            switch( alignSize / 4 )
            {
            case 3:
                *reinterpret_cast<UINT32*>( pDst8 - 12 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 12 );
            case 2:
                *reinterpret_cast<UINT32*>( pDst8 - 8 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 8 );
            case 1:
                *reinterpret_cast<UINT32*>( pDst8 - 4 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 4 );
            }

            bytesRemaining -= alignSize;
        }

        // if the size is greater than 1/2 largest cache
        // stream (non-temporal store) 128-byte blocks to avoid cache pollution
        if( bytesRemaining > MIN_STREAM_SIZE )
        {
            while( bytesRemaining >= 128 )
            {
                pDst8 += 128;
                pSrc8 += 128;
                bytesRemaining -= 128;

                xmm0 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 128 ));
                xmm1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
                xmm2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
                xmm3 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
                xmm4 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
                xmm5 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
                xmm6 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
                xmm7 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));

                // destination is 16-byte aligned here, as _mm_stream_si128 requires
                _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 128 ), xmm0 );
                _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm1 );
                _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm2 );
                _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm3 );
                _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm4 );
                _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm5 );
                _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm6 );
                _mm_stream_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm7);
            }

            // copy up to 128 bytes
            const size_t ptrAdvance = bytesRemaining & ~0xF;

            pDst8 += ptrAdvance;
            pSrc8 += ptrAdvance;

            // intentional fall-through: copies bytesRemaining/16 registers
            switch( bytesRemaining / 16 )
            {
            case 7:
                xmm0 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
                _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm0 );
            case 6:
                xmm1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
                _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm1 );
            case 5:
                xmm2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
                _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm2 );
            case 4:
                xmm3 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
                _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm3 );
            case 3:
                xmm4 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
                _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm4 );
            case 2:
                xmm5 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
                _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm5 );
            case 1:
                xmm6 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));
                _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm6 );
            }

            bytesRemaining -= ptrAdvance;
        }

        // size is less than 1/2 the largest cache, copy either fully aligned or partially aligned
        else
        {
            const size_t alignSrc16 = reinterpret_cast<UINT_PTR>( pSrc8 ) & 0xF;

            // copy with source un-aligned
            if( alignSrc16 != 0 )
            {
                while( bytesRemaining >= 128 )
                {
                    pDst8 += 128;
                    pSrc8 += 128;
                    bytesRemaining -= 128;

                    xmm0 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 128 ));
                    xmm1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
                    xmm2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
                    xmm3 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
                    xmm4 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
                    xmm5 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
                    xmm6 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
                    xmm7 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));

                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 128 ), xmm0 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm1 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm2 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm3 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm4 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm5 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm6 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm7 );
                }

                // copy up to 128 bytes
                const size_t ptrAdvance = bytesRemaining & ~0xF;

                pDst8 += ptrAdvance;
                pSrc8 += ptrAdvance;

                // intentional fall-through: copies bytesRemaining/16 registers
                switch( bytesRemaining / 16 )
                {
                case 7:
                    xmm0 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm0 );
                case 6:
                    xmm1 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm1 );
                case 5:
                    xmm2 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm2 );
                case 4:
                    xmm3 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm3 );
                case 3:
                    xmm4 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm4 );
                case 2:
                    xmm5 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm5 );
                case 1:
                    xmm6 = _mm_loadu_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm6 );
                }

                bytesRemaining -= ptrAdvance;
            }

            // copy with source aligned
            else
            {
                while( bytesRemaining >= 128 )
                {
                    pDst8 += 128;
                    pSrc8 += 128;
                    bytesRemaining -= 128;

                    xmm0 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 128 ));
                    xmm1 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
                    xmm2 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
                    xmm3 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
                    xmm4 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
                    xmm5 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
                    xmm6 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
                    xmm7 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));

                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 128 ), xmm0 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm1 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm2 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm3 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm4 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm5 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm6 );
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm7 );
                }

                // copy up to 128 bytes
                const size_t ptrAdvance = bytesRemaining & ~0xF;

                pDst8 += ptrAdvance;
                pSrc8 += ptrAdvance;

                // intentional fall-through: copies bytesRemaining/16 registers
                switch( bytesRemaining / 16 )
                {
                case 7:
                    xmm0 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 112 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 112 ), xmm0 );
                case 6:
                    xmm1 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 96 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 96 ), xmm1 );
                case 5:
                    xmm2 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 80 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 80 ), xmm2 );
                case 4:
                    xmm3 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 64 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 64 ), xmm3 );
                case 3:
                    xmm4 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 48 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 48 ), xmm4 );
                case 2:
                    xmm5 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 32 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 32 ), xmm5 );
                case 1:
                    xmm6 = _mm_load_si128( reinterpret_cast<const __m128i*>( pSrc8 - 16 ));
                    _mm_store_si128( reinterpret_cast<__m128i*>( pDst8 - 16 ), xmm6 );
                }

                bytesRemaining -= ptrAdvance;
            }
        }

        // copy the tail up to 15 bytes
        if( bytesRemaining )
        {
            const size_t ptrAdvance = bytesRemaining & ~0x3;

            pDst8 += ptrAdvance;
            pSrc8 += ptrAdvance;

            // copy last up to 12 bytes (intentional fall-through)
            switch( bytesRemaining / 4 )
            {
            case 3:
                *reinterpret_cast<UINT32*>( pDst8 - 12 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 12 );
            case 2:
                *reinterpret_cast<UINT32*>( pDst8 - 8 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 8 );
            case 1:
                *reinterpret_cast<UINT32*>( pDst8 - 4 ) = *reinterpret_cast<const UINT32*>( pSrc8 - 4 );
            }

            // copy last up to 3 bytes
            if( bytesRemaining & 0x1 )
            {
                *pDst8 = *pSrc8;

                bytesRemaining -= 1;
                pDst8 += 1;
                pSrc8 += 1;
            }

            if( bytesRemaining & 0x2 )
            {
                *reinterpret_cast<UINT16*>( pDst8 ) = *reinterpret_cast<const UINT16*>( pSrc8 );

                bytesRemaining -= 2;
                pDst8 += 2;
                pSrc8 += 2;
            }
        }
    }
#else // #if defined ( _MSC_VER )
    // Non-MSVC builds: memcpy_s is supplied by inc/common/secure_mem.h
    ::memcpy_s(dst, bytes, src, bytes);
#endif
}
627
628 /*****************************************************************************\
629 Inline Function:
630 MemCopyWC
631
632 Description:
633 Memory copy to a destination that is un-cacheable, i.e host to gpu.
634
635 Input:
636 dst - pointer to write-combined destination buffer
637 src - pointer to source buffer
638 bytes - number of bytes to copy
639 \*****************************************************************************/
inline void MemCopyWC( void* dst, const void* src, const size_t bytes )
{
    // Copies 'bytes' bytes from cacheable 'src' to an un-cacheable /
    // write-combined 'dst' (e.g. host -> gpu memory).
    // MSVC path: brings the destination to 16-byte alignment with a masked
    // non-temporal store, streams the bulk with _mm_stream_si128 (WC-friendly
    // non-temporal stores), then finishes the sub-16-byte tail with scalar
    // moves. Non-MSVC builds forward to memcpy_s (from secure_mem.h).
#if defined ( _MSC_VER )
    // descending 15..0 lane indices, used to build the alignment store mask
    const __m128i s_SSE2CmpMask = _mm_setr_epi8( 0x0F, 0x0E, 0x0D, 0x0C, 0x0B, 0x0A, 0x09, 0x08, 0x07, 0x06, 0x05, 0x04, 0x03, 0x02, 0x01, 0x00 );
    const __m128i* pMMSrc = reinterpret_cast<const __m128i*>(src);
    __m128i* pMMDest = reinterpret_cast<__m128i*>(dst);
    size_t count = bytes;
    size_t cnt = 0;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;

    // if size > 16 align destination and move non-temporally
    if (count >= INSTR_WIDTH_128)
    {
        // align destination to 16 if necessary
        UINT32 align = (UINT32)((UINT_PTR)pMMDest & TAIL_SIZE);
        if (align != 0)
        {
            // move alignment through a masked non-temporal move
            const char* pSrc = reinterpret_cast<const char*>(pMMSrc);
            char* pDst = reinterpret_cast<char*>(pMMDest);

            align = INSTR_WIDTH_128 - align;   // bytes needed to reach 16-byte alignment
            char shiftCnt = (char)(INSTR_WIDTH_128 - align - 1);
            __m128i shiftMask = _mm_set1_epi8(shiftCnt);
            // selects the first 'align' byte lanes (where 15-i > shiftCnt)
            __m128i mask = _mm_cmpgt_epi8(s_SSE2CmpMask, shiftMask);
            __m128i val = _mm_loadu_si128(pMMSrc);  // safe: count >= 16 here
            _mm_maskmoveu_si128(val, mask, pDst);

            pSrc += align;
            pDst += align;

            pMMSrc = reinterpret_cast<const __m128i*>(pSrc);
            pMMDest = reinterpret_cast<__m128i*>(pDst);
        }

        count -= align; // take off the alignment from size (align is 0 when none was needed)

        // check source alignment
        if ((UINT_PTR)pMMSrc & TAIL_SIZE)
        {
            // copy un-aligned by tiers: 128-byte blocks first...
            cnt = count >> DUAL_CACHE_SHIFT;
            for (UINT32 i = 0; i < cnt; i += 1)
            {
                xmm0 = _mm_loadu_si128(pMMSrc);
                xmm1 = _mm_loadu_si128(pMMSrc + 1);
                xmm2 = _mm_loadu_si128(pMMSrc + 2);
                xmm3 = _mm_loadu_si128(pMMSrc + 3);
                xmm4 = _mm_loadu_si128(pMMSrc + 4);
                xmm5 = _mm_loadu_si128(pMMSrc + 5);
                xmm6 = _mm_loadu_si128(pMMSrc + 6);
                xmm7 = _mm_loadu_si128(pMMSrc + 7);
                pMMSrc += 8;

                _mm_stream_si128(pMMDest, xmm0);
                _mm_stream_si128(pMMDest + 1, xmm1);
                _mm_stream_si128(pMMDest + 2, xmm2);
                _mm_stream_si128(pMMDest + 3, xmm3);
                _mm_stream_si128(pMMDest + 4, xmm4);
                _mm_stream_si128(pMMDest + 5, xmm5);
                _mm_stream_si128(pMMDest + 6, xmm6);
                _mm_stream_si128(pMMDest + 7, xmm7);
                pMMDest += 8;
            }

            // ...then 16-byte blocks for the sub-128-byte remainder
            count &= TIERED_TAIL;
            if (count != 0)
            {
                cnt = count >> INSTR_128_SHIFT;
                for (UINT32 i = 0; i < cnt; i += 1)
                {
                    xmm0 = _mm_loadu_si128(pMMSrc);
                    pMMSrc += 1;
                    _mm_stream_si128(pMMDest, xmm0);
                    pMMDest += 1;
                }
            }
        }
        else
        {
            // copy aligned by tiers: 128-byte blocks first...
            cnt = count >> DUAL_CACHE_SHIFT;
            for (UINT32 i = 0; i < cnt; i += 1)
            {
                xmm0 = _mm_load_si128(pMMSrc);
                xmm1 = _mm_load_si128(pMMSrc + 1);
                xmm2 = _mm_load_si128(pMMSrc + 2);
                xmm3 = _mm_load_si128(pMMSrc + 3);
                xmm4 = _mm_load_si128(pMMSrc + 4);
                xmm5 = _mm_load_si128(pMMSrc + 5);
                xmm6 = _mm_load_si128(pMMSrc + 6);
                xmm7 = _mm_load_si128(pMMSrc + 7);
                pMMSrc += 8;

                _mm_stream_si128(pMMDest, xmm0);
                _mm_stream_si128(pMMDest + 1, xmm1);
                _mm_stream_si128(pMMDest + 2, xmm2);
                _mm_stream_si128(pMMDest + 3, xmm3);
                _mm_stream_si128(pMMDest + 4, xmm4);
                _mm_stream_si128(pMMDest + 5, xmm5);
                _mm_stream_si128(pMMDest + 6, xmm6);
                _mm_stream_si128(pMMDest + 7, xmm7);
                pMMDest += 8;
            }

            // ...then 16-byte blocks for the sub-128-byte remainder
            count &= TIERED_TAIL;
            if (count != 0)
            {
                cnt = count >> INSTR_128_SHIFT;
                for (UINT32 i = 0; i < cnt; i += 1)
                {
                    xmm0 = _mm_load_si128(pMMSrc);
                    pMMSrc += 1;
                    _mm_stream_si128(pMMDest, xmm0);
                    pMMDest += 1;
                }
            }
        }
    }

    // handle tail copy as a fallthrough: remaining < 16 bytes as DWORDs,
    // then single bytes
    count &= TAIL_SIZE;
    if (count != 0)
    {
        cnt = count >> DWORD_SHIFT;
        DWORD* pDst = reinterpret_cast<DWORD*>(pMMDest);
        const DWORD* pSrc = reinterpret_cast<const DWORD*>(pMMSrc);

        for (UINT32 i = 0; i < cnt; i += 1)
        {
            *pDst = *pSrc;
            pDst += 1;
            pSrc += 1;
        }

        cnt = count & BYTE_TAIL;
        BYTE* bDst = reinterpret_cast<BYTE*>(pDst);
        const BYTE* bSrc = reinterpret_cast<const BYTE*>(pSrc);

        for (UINT32 i = 0; i < cnt; i += 1)
        {
            *bDst = *bSrc;
            bDst += 1;
            bSrc += 1;
        }
    }
#else // #if defined ( _MSC_VER )
    // Non-MSVC builds: memcpy_s is supplied by inc/common/secure_mem.h
    ::memcpy_s(dst, bytes, src, bytes);
#endif
}
791
792 /*****************************************************************************\
793 Inline Function:
794 ScalarSwapBytes
795
796 Description:
797 Helper function for MemCopySwapBytes
798 \*****************************************************************************/
ScalarSwapBytes(__m128i ** dst,const __m128i ** src,const size_t byteCount,const unsigned int swapbytes)799 inline void ScalarSwapBytes(
800 __m128i** dst,
801 const __m128i** src,
802 const size_t byteCount,
803 const unsigned int swapbytes)
804 {
805 switch (swapbytes)
806 {
807 case 2:
808 {
809 WORD* wDst = reinterpret_cast<WORD*>(*dst);
810 const WORD* wSrc = reinterpret_cast<const WORD*>(*src);
811
812 for (UINT32 i = 0; i < byteCount / 2; i += 1)
813 {
814 WORD tmp = *wSrc;
815 *wDst = (tmp >> 8) | (tmp << 8);
816 wDst += 1;
817 wSrc += 1;
818 }
819
820 *src = reinterpret_cast<const __m128i*>(wSrc);
821 *dst = reinterpret_cast<__m128i*>(wDst);
822 }
823 break;
824 case 4:
825 {
826 DWORD* dwDst = reinterpret_cast<DWORD*>(*dst);
827 const DWORD* dwSrc = reinterpret_cast<const DWORD*>(*src);
828
829 for (UINT32 i = 0; i < byteCount / 4; i += 1)
830 {
831 DWORD tmp = *dwSrc;
832 *dwDst = (tmp >> 24) | (tmp << 24) |
833 ((tmp & 0x0000FF00) << 8) |
834 ((tmp & 0x00FF0000) >> 8);
835 dwDst += 1;
836 dwSrc += 1;
837 }
838
839 *src = reinterpret_cast<const __m128i*>(dwSrc);
840 *dst = reinterpret_cast<__m128i*>(dwDst);
841 }
842 break;
843 default:
844 // should not occur
845 BYTE* bDst = reinterpret_cast<BYTE*>(*dst);
846 const BYTE* bSrc = reinterpret_cast<const BYTE*>(*src);
847
848 ::memcpy_s(bDst, byteCount, bSrc, byteCount);
849
850 *src = reinterpret_cast<const __m128i*>(bSrc + byteCount);
851 *dst = reinterpret_cast<__m128i*>(bDst + byteCount);
852 }
853 }
854
855 /*****************************************************************************\
856 Inline Function:
857 MemCopySwapBytes
858
859 Description:
860 Memory copy with swapped byte order, 2 and 4 byte elements only
861
862 Input:
863 dst - pointer to write-combined destination buffer
864 src - pointer to source buffer
865 bytes - number of bytes to copy
866 swapbytes - granularity of elements to swap
867 \*****************************************************************************/
MemCopySwapBytes(void * dst,const void * src,const size_t bytes,const unsigned int swapbytes)868 inline void MemCopySwapBytes(
869 void* dst,
870 const void* src,
871 const size_t bytes,
872 const unsigned int swapbytes)
873 {
874 const __m128i* pMMSrc = reinterpret_cast<const __m128i*>(src);
875 __m128i* pMMDest = reinterpret_cast<__m128i*>(dst);
876 size_t count = bytes;
877 size_t cnt = 0;
878 __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
879
880 // 2 byte shuffle
881 const __m128i wordMask = _mm_setr_epi8(
882 0x01, 0x00, 0x03, 0x02, 0x05, 0x04, 0x07, 0x06,
883 0x09, 0x08, 0x0b, 0x0a, 0x0d, 0x0c, 0x0f, 0x0e);
884
885 // 4 byte shuffle
886 const __m128i dwordMask = _mm_setr_epi8(
887 0x03, 0x02, 0x01, 0x00, 0x07, 0x06, 0x05, 0x04,
888 0x0b, 0x0a, 0x09, 0x08, 0x0f, 0x0e, 0x0d, 0x0c);
889
890 // SSE3 support required
891 CPU_INSTRUCTION_LEVEL cpuInstructionLevel = GetCpuInstructionLevel();
892 if (cpuInstructionLevel < CPU_INSTRUCTION_LEVEL_SSE3)
893 {
894 ScalarSwapBytes(&pMMDest, &pMMSrc, count, swapbytes);
895 return;
896 }
897
898 // only handle 2 and 4 bytes swapping
899 if (swapbytes != 2 && swapbytes != 4)
900 {
901 MemCopy(pMMDest, pMMSrc, count);
902 return;
903 }
904
905 // when size is < 16 rely, must use scalar swap
906 if (count < INSTR_WIDTH_128)
907 {
908 ScalarSwapBytes(&pMMDest, &pMMSrc, count, swapbytes);
909 }
910 else
911 {
912 const __m128i shuffleMask = (swapbytes == 2) ? wordMask : dwordMask;
913
914 // handle un-aligned tiered copy up to 2 cache lines
915 if (count < 2 * CACHE_LINE_SIZE)
916 {
917 cnt = count >> INSTR_128_SHIFT;
918 for (UINT32 i = 0; i < cnt; i += 1)
919 {
920 xmm0 = _mm_loadu_si128(pMMSrc);
921 pMMSrc += 1;
922 xmm0 = _mm_shuffle_epi8(xmm0, shuffleMask);
923 _mm_storeu_si128(pMMDest, xmm0);
924 pMMDest += 1;
925 }
926 }
927 // handle aligned copy for > 2 cache lines
928 else
929 {
930 // align destination to 16 if necessary
931 UINT32 align = (UINT32)((UINT_PTR)pMMDest & TAIL_SIZE);
932 if (align != 0)
933 {
934 align = INSTR_WIDTH_128 - align;
935 cnt = align >> DWORD_SHIFT;
936 ScalarSwapBytes(&pMMDest, &pMMSrc, cnt * sizeof(DWORD), swapbytes);
937 cnt = align & BYTE_TAIL;
938
939 // only words should remain, not bytes
940 if (cnt > 0)
941 {
942 ASSERT(cnt % 2 == 0);
943 ASSERT(swapbytes == 2);
944 ScalarSwapBytes(&pMMDest, &pMMSrc, cnt, swapbytes);
945 }
946 }
947
948 count -= align; // take off the alignment from size
949
950 // check source alignment
951 if ((UINT_PTR)pMMSrc & TAIL_SIZE)
952 {
953 // copy un-aligned by tiers
954 cnt = count >> DUAL_CACHE_SHIFT;
955 for (UINT32 i = 0; i < cnt; i += 1)
956 {
957 xmm0 = _mm_loadu_si128(pMMSrc);
958 xmm1 = _mm_loadu_si128(pMMSrc + 1);
959 xmm2 = _mm_loadu_si128(pMMSrc + 2);
960 xmm3 = _mm_loadu_si128(pMMSrc + 3);
961 xmm4 = _mm_loadu_si128(pMMSrc + 4);
962 xmm5 = _mm_loadu_si128(pMMSrc + 5);
963 xmm6 = _mm_loadu_si128(pMMSrc + 6);
964 xmm7 = _mm_loadu_si128(pMMSrc + 7);
965 pMMSrc += 8;
966
967 xmm0 = _mm_shuffle_epi8(xmm0, shuffleMask);
968 xmm1 = _mm_shuffle_epi8(xmm1, shuffleMask);
969 xmm2 = _mm_shuffle_epi8(xmm2, shuffleMask);
970 xmm3 = _mm_shuffle_epi8(xmm3, shuffleMask);
971 xmm4 = _mm_shuffle_epi8(xmm4, shuffleMask);
972 xmm5 = _mm_shuffle_epi8(xmm5, shuffleMask);
973 xmm6 = _mm_shuffle_epi8(xmm6, shuffleMask);
974 xmm7 = _mm_shuffle_epi8(xmm7, shuffleMask);
975
976 _mm_store_si128(pMMDest, xmm0);
977 _mm_store_si128(pMMDest + 1, xmm1);
978 _mm_store_si128(pMMDest + 2, xmm2);
979 _mm_store_si128(pMMDest + 3, xmm3);
980 _mm_store_si128(pMMDest + 4, xmm4);
981 _mm_store_si128(pMMDest + 5, xmm5);
982 _mm_store_si128(pMMDest + 6, xmm6);
983 _mm_store_si128(pMMDest + 7, xmm7);
984 pMMDest += 8;
985 }
986
987 count &= TIERED_TAIL;
988 if (count != 0)
989 {
990 cnt = count >> INSTR_128_SHIFT;
991 for (UINT32 i = 0; i < cnt; i += 1)
992 {
993 xmm0 = _mm_loadu_si128(pMMSrc);
994 pMMSrc += 1;
995 xmm0 = _mm_shuffle_epi8(xmm0, shuffleMask);
996 _mm_store_si128(pMMDest, xmm0);
997 pMMDest += 1;
998 }
999 }
1000 }
1001 else
1002 {
1003 // copy aligned by tiers
1004 cnt = count >> DUAL_CACHE_SHIFT;
1005 for (UINT32 i = 0; i < cnt; i += 1)
1006 {
1007 xmm0 = _mm_load_si128(pMMSrc);
1008 xmm1 = _mm_load_si128(pMMSrc + 1);
1009 xmm2 = _mm_load_si128(pMMSrc + 2);
1010 xmm3 = _mm_load_si128(pMMSrc + 3);
1011 xmm4 = _mm_load_si128(pMMSrc + 4);
1012 xmm5 = _mm_load_si128(pMMSrc + 5);
1013 xmm6 = _mm_load_si128(pMMSrc + 6);
1014 xmm7 = _mm_load_si128(pMMSrc + 7);
1015 pMMSrc += 8;
1016
1017 xmm0 = _mm_shuffle_epi8(xmm0, shuffleMask);
1018 xmm1 = _mm_shuffle_epi8(xmm1, shuffleMask);
1019 xmm2 = _mm_shuffle_epi8(xmm2, shuffleMask);
1020 xmm3 = _mm_shuffle_epi8(xmm3, shuffleMask);
1021 xmm4 = _mm_shuffle_epi8(xmm4, shuffleMask);
1022 xmm5 = _mm_shuffle_epi8(xmm5, shuffleMask);
1023 xmm6 = _mm_shuffle_epi8(xmm6, shuffleMask);
1024 xmm7 = _mm_shuffle_epi8(xmm7, shuffleMask);
1025
1026 _mm_store_si128(pMMDest, xmm0);
1027 _mm_store_si128(pMMDest + 1, xmm1);
1028 _mm_store_si128(pMMDest + 2, xmm2);
1029 _mm_store_si128(pMMDest + 3, xmm3);
1030 _mm_store_si128(pMMDest + 4, xmm4);
1031 _mm_store_si128(pMMDest + 5, xmm5);
1032 _mm_store_si128(pMMDest + 6, xmm6);
1033 _mm_store_si128(pMMDest + 7, xmm7);
1034 pMMDest += 8;
1035 }
1036
1037 count &= TIERED_TAIL;
1038 if (count != 0)
1039 {
1040 cnt = count >> INSTR_128_SHIFT;
1041 for (UINT32 i = 0; i < cnt; i += 1)
1042 {
1043 xmm0 = _mm_load_si128(pMMSrc);
1044 pMMSrc += 1;
1045 xmm0 = _mm_shuffle_epi8(xmm0, shuffleMask);
1046 _mm_store_si128(pMMDest, xmm0);
1047 pMMDest += 1;
1048 }
1049 }
1050 }
1051 }
1052
1053 // handle tail copy as a fallthrough
1054 count &= TAIL_SIZE;
1055 if (count != 0)
1056 {
1057 cnt = count >> DWORD_SHIFT;
1058 ScalarSwapBytes(&pMMDest, &pMMSrc, cnt * sizeof(DWORD), swapbytes);
1059 cnt = count & BYTE_TAIL;
1060
1061 // only words should remain, not bytes
1062 if (cnt > 0)
1063 {
1064 ASSERT(cnt % 2 == 0);
1065 ASSERT(swapbytes == 2);
1066 ScalarSwapBytes(&pMMDest, &pMMSrc, cnt, swapbytes);
1067 }
1068 }
1069 }
1070 }
1071
1072 /*****************************************************************************\
1073 Inline Function:
1074 SafeMemSet
1075
1076 Description:
1077 Exception Handler Memory Set function
1078 \*****************************************************************************/
// Fills "bytes" bytes at dst with the byte value "data" via ::memset.
// In debug kernel-mode builds (_DEBUG && ISTDLIB_KMD) the write is wrapped
// in SEH so a bad destination pointer traps into the debugger instead of
// silently corrupting memory; in all other builds this is a plain memset.
inline void SafeMemSet( void* dst, const int data, const size_t bytes )
{
#if defined(_DEBUG) && defined(ISTDLIB_KMD)
    __try
#endif
    {
        ::memset( dst, data, bytes );
    }
#if defined(_DEBUG) && defined(ISTDLIB_KMD)
    // catch exceptions here so they are easily debugged
    __except(1)
    {
        ASSERT(0);
    }
#endif
}
1095
1096 /*****************************************************************************\
1097 Inline Function:
1098 SafeMemCompare
1099
1100 Description:
1101 Exception Handler Memory Compare function
1102 \*****************************************************************************/
// memcmp wrapper: returns the <0 / 0 / >0 ordering of the first "bytes"
// bytes of dst vs. src.  In debug kernel-mode builds the compare is wrapped
// in SEH; on a fault it asserts and returns -1 (i.e. reports "not equal").
inline int SafeMemCompare( const void* dst, const void* src, const size_t bytes )
{
#if defined(_DEBUG) && defined(ISTDLIB_KMD)
    __try
#endif
    {
        return ::memcmp( dst, src, bytes );
    }
#if defined(_DEBUG) && defined(ISTDLIB_KMD)
    // catch exceptions here so they are easily debugged
    __except(1)
    {
        ASSERT(0);
        return -1;
    }
#endif
}
1120
1121 /*****************************************************************************\
1122 Inline Function:
1123 SafeMemMove
1124
1125 Description:
1126 copies "bytes" of data from src to dst.
1127 dst is not corrupted if src and dst blocks of data overlap.
1128
1129 Input:
1130 dst - pointer to destination buffer
1131 src - pointer to source buffer
1132 bytes - number of bytes to copy
1133 \*****************************************************************************/
inline void SafeMemMove( void *dst, const void *src, const size_t bytes )
{
    // Nothing to do for a self-copy or an empty copy.  The bytes == 0 guard
    // also fixes a buffer overrun in the original code: with bytes == 0 the
    // backward-copy branch computed "bytes - 1", which underflowed size_t
    // and the do-while then wrote ~SIZE_MAX bytes.
    if( dst == src || bytes == 0 )
    {
        return;
    }

    unsigned char* d = static_cast< unsigned char* >( dst );
    const unsigned char* s = static_cast< const unsigned char* >( src );

    if( s > d )
    {
        // Source lies above destination: copy forward so any overlapping
        // bytes are read before they are overwritten.
        for( size_t t = 0; t < bytes; ++t )
        {
            d[t] = s[t];
        }
    }
    else
    {
        // Destination lies above source: copy backward for the same reason.
        for( size_t t = bytes; t-- != 0; )
        {
            d[t] = s[t];
        }
    }
}
1158
1159 /*****************************************************************************\
1160 MACROS:
1161 EMIT_R_MR
1162 Example: movntdqa xmm1, xmmword ptr [eax]
1163
1164 EMIT_R_MR_OFFSET
1165 Example: movntdqa xmm1, xmmword ptr [eax + 0x10]
1166
1167 Description:
    Used to encode SSE4.1 instructions with parameters
1169 \*****************************************************************************/
// Emits OPCODE followed by a ModR/M byte for "reg, [reg]" addressing
// (mod = 00): X is the 3-bit destination register field, Y the base register.
#define EMIT_R_MR(OPCODE, X, Y ) \
    OPCODE \
    __asm _emit (0x00 + X*8 + Y)

// Emits OPCODE followed by a ModR/M byte for "reg, [reg + disp32]"
// addressing (mod = 10) and the 32-bit displacement in little-endian order.
#define EMIT_R_MR_OFFSET(OPCODE, X, Y, OFFSET) \
    OPCODE \
    __asm _emit (0x80 + X*8 + Y) \
    __asm _emit (OFFSET&0xFF) \
    __asm _emit ((OFFSET>>8)&0xFF) \
    __asm _emit ((OFFSET>>16)&0xFF) \
    __asm _emit ((OFFSET>>24)&0xFF)
1181
1182 /*****************************************************************************\
1183 MACROS:
1184 REG_XXX
1185
1186 Description:
1187 Define CPU General Purpose and XMM Register Indices
1188 These MACROS are to be replaced with instrinics available with .NET 2008
1189 \*****************************************************************************/
#if defined( _MSC_VER )
// x86 general-purpose register encodings (3-bit ModR/M register fields).
#define REG_EAX 0x00
#define REG_ECX 0x01
#define REG_EDX 0x02
#define REG_EBX 0x03
#define REG_ESP 0x04
#define REG_EBP 0x05
#define REG_ESI 0x06
#define REG_EDI 0x07
// XMM register encodings use the same 3-bit field values.
#define REG_XMM0 0x00
#define REG_XMM1 0x01
#define REG_XMM2 0x02
#define REG_XMM3 0x03
#define REG_XMM4 0x04
#define REG_XMM5 0x05
#define REG_XMM6 0x06
#define REG_XMM7 0x07
#endif //#if defined( _MSC_VER )
1208
1209 /*****************************************************************************\
1210 MACROS:
1211 MOVNTDQA_OP
1212 MOVNTDQA_R_MR
1213 MOVNTDQA_R_MRB
1214
1215 Description:
1216 Used to emit SSE4_1 movntdqa (streaming load) instructions
1217 SRC - XMM Register, destination data is to be stored
1218 DST - General Purpose Register containing source address
1219 OFFSET - Offset to be added to the source address
1220 \*****************************************************************************/
// Raw opcode bytes of MOVNTDQA (66 0F 38 2A), emitted byte-by-byte because
// older MSVC inline assemblers do not recognize SSE4.1 mnemonics.
#define MOVNTDQA_OP \
    _asm _emit 0x66 \
    _asm _emit 0x0F \
    _asm _emit 0x38 \
    _asm _emit 0x2A

// movntdqa DST, xmmword ptr [SRC]
#define MOVNTDQA_R_MR(DST, SRC) \
    EMIT_R_MR(MOVNTDQA_OP, DST, SRC)

// movntdqa DST, xmmword ptr [SRC + OFFSET]
#define MOVNTDQA_R_MR_OFFSET(DST, SRC, OFFSET) \
    EMIT_R_MR_OFFSET(MOVNTDQA_OP, DST, SRC, OFFSET)
1232
1233 /*****************************************************************************\
1234 Inline Function:
1235 FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa
1236
1237 Description: Fast copy from USWC memory to cacheable system memory
1238
1239 Input:
1240 dst - 16-byte aligned pointer to (cacheable) destination buffer
1241 src - 16-byte(req)/64-byte(optimal) aligned pointer to (USWC) source buffer
1242 \*****************************************************************************/
#if defined( _MSC_VER ) && !defined (_WIN64)
// Copies one 64-byte block (4 x 16 bytes) from USWC memory to cacheable
// memory: streaming loads (movntdqa, emitted via the MOVNTDQA_* macros
// because older 32-bit MSVC assemblers lack the mnemonic) followed by
// ALIGNED stores (movdqa) — so dst must be 16-byte aligned; use the
// _movdqu variant otherwise.  Clobbers xmm0-xmm3 and edx/ecx.
__forceinline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa( void* dst, const void* src )
{

    __asm
    {
        ;Store the original source start address
        mov edx, src

        ;Store the dest address
        mov ecx, dst

        align 16

        ; Load data from source buffer
        ; Streaming loads from the same cache line should be grouped together
        ; and not be interleaved with: a) Writes or non-streaming loads or
        ; b) Streaming loads from other cache lines (strided accesses)

        ; movntdqa xmm0, xmmword ptr [edx]
        MOVNTDQA_R_MR(REG_XMM0, REG_EDX)

        ; movntdqa xmm1, xmmword ptr [edx+16]
        MOVNTDQA_R_MR_OFFSET(REG_XMM1, REG_EDX, 16)

        ; movntdqa xmm2, xmmword ptr [edx+32]
        MOVNTDQA_R_MR_OFFSET(REG_XMM2, REG_EDX, 32)

        ; movntdqa xmm3, xmmword ptr [edx+48]
        MOVNTDQA_R_MR_OFFSET(REG_XMM3, REG_EDX, 48)

        ; Save data in destination buffer.
        movdqa xmmword ptr [ecx], xmm0
        movdqa xmmword ptr [ecx+16], xmm1
        movdqa xmmword ptr [ecx+32], xmm2
        movdqa xmmword ptr [ecx+48], xmm3
    }

} // FastMemCopy_SSE4_1_movntdqa_movdqa()
#endif //#if defined( _MSC_VER ) && !defined (_WIN64)
1283
1284 /*****************************************************************************\
1285 Inline Function:
1286 FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu
1287
1288 Description: Fast copy from USWC memory (DHWORD in size) to cacheable system memory
1289
1290 Input:
1291 dst - 16-byte (unaligned) pointer to (cacheable) destination buffer
1292 src - 16-byte(req)/64-byte(optimal) aligned pointer to (USWC) source buffer
1293 \*****************************************************************************/
#if defined ( _MSC_VER ) && !defined(_WIN64)
// Copies one 64-byte block (4 x 16 bytes) from USWC memory to cacheable
// memory: streaming loads (movntdqa, emitted via the MOVNTDQA_* macros)
// followed by UNALIGNED stores (movdqu), for destinations that are not
// 16-byte aligned.  src must still be 16-byte aligned (per the header
// comment above — a movntdqa requirement).  Clobbers xmm0-xmm3, edx/ecx.
__forceinline void __fastcall FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu(void* dst, const void* src )
{
    __asm
    {
        ;Store the original source start address
        mov edx, src

        ;Store the dest address
        mov ecx, dst

        align 16

        ; Load data from source buffer
        ; Streaming loads from the same cache line should be grouped together
        ; and not be interleaved with: a) Writes or non-streaming loads or
        ; b) Streaming loads from other cache lines (strided accesses)

        ; movntdqa xmm0, xmmword ptr [edx]
        MOVNTDQA_R_MR(REG_XMM0, REG_EDX)

        ; movntdqa xmm1, xmmword ptr [edx+16]
        MOVNTDQA_R_MR_OFFSET(REG_XMM1, REG_EDX, 16)

        ; movntdqa xmm2, xmmword ptr [edx+32]
        MOVNTDQA_R_MR_OFFSET(REG_XMM2, REG_EDX, 32)

        ; movntdqa xmm3, xmmword ptr [edx+48]
        MOVNTDQA_R_MR_OFFSET(REG_XMM3, REG_EDX, 48)

        ; Copy data in destination buffer.
        movdqu xmmword ptr [ecx], xmm0
        movdqu xmmword ptr [ecx+16], xmm1
        movdqu xmmword ptr [ecx+32], xmm2
        movdqu xmmword ptr [ecx+48], xmm3
    }
} // FastMemCopy_SSE4_1_movntdqa_movdqu()
#endif // #if defined( _MSC_VER ) && !defined (_WIN64)
1332
1333
// Copies "bytes" bytes from (write-combined, uncached) src to cacheable dst.
// When SSE4.1 is reported available, the DHWORD(64-byte)-aligned middle of
// the buffer is copied with MOVNTDQA streaming loads (the fast way to read
// USWC memory); the unaligned head and the sub-64-byte tail fall back to
// MemCopy.  On configurations with no streaming path the whole call is a
// plain MemCopy.
inline void FastMemCopyFromWC( void* dst, const void* src, const size_t bytes, CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
{
#if defined( _MSC_VER ) && (!defined (_WIN64) || defined ( _In_ ) ) || defined (__GNUC__)
    if( cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
    {
        // Cache pointers to memory
        BYTE* p_dst = (BYTE*)dst;
        BYTE* p_src = (BYTE*)src;

        size_t count = bytes;

        if( count >= sizeof(DHWORD) )
        {
            //Streaming Load must be 16-byte aligned but should
            //be 64-byte aligned for optimal performance
            const size_t doubleHexWordAlignBytes =
                GetAlignmentOffset( p_src, sizeof(DHWORD) );

            // Copy portion of the source memory that is not aligned
            if( doubleHexWordAlignBytes )
            {
                MemCopy( p_dst, p_src, doubleHexWordAlignBytes );

                p_dst += doubleHexWordAlignBytes;
                p_src += doubleHexWordAlignBytes;
                count -= doubleHexWordAlignBytes;
            }

            ASSERT( IsAligned( p_src, sizeof(DHWORD) ) == true );

            // Get the number of bytes to be copied (rounded down to nearest DHWORD)
            const size_t DoubleHexWordsToCopy = count / sizeof(DHWORD);

            if( DoubleHexWordsToCopy )
            {
                // Determine if the destination address is aligned;
                // 16-byte-aligned destinations can take movdqa stores,
                // unaligned ones need movdqu.
                const bool isDstDoubleQuadWordAligned =
                    IsAligned( p_dst, sizeof(DQWORD) );

#if defined(_WIN64) || defined(__GNUC__)
                __m128i* pMMSrc = (__m128i*)(p_src);
                __m128i* pMMDest = reinterpret_cast<__m128i*>(p_dst);
                __m128i xmm0, xmm1, xmm2, xmm3;
#endif

                if( isDstDoubleQuadWordAligned )
                {
#if defined(__GNUC__)
                    // Sync the WC memory data before issuing the MOVNTDQA instruction.
                    _mm_mfence();
#endif
                    for( size_t i=0; i<DoubleHexWordsToCopy; i++ )
                    {

#if !defined(_WIN64) && !defined(__GNUC__)
                        // 32-bit MSVC: emitted-opcode helper (aligned stores)
                        FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqa( p_dst, p_src );
#else
                        xmm0 = _mm_stream_load_si128(pMMSrc);
                        xmm1 = _mm_stream_load_si128(pMMSrc + 1);
                        xmm2 = _mm_stream_load_si128(pMMSrc + 2);
                        xmm3 = _mm_stream_load_si128(pMMSrc + 3);
                        pMMSrc += 4;

                        _mm_store_si128(pMMDest, xmm0);
                        _mm_store_si128(pMMDest + 1, xmm1);
                        _mm_store_si128(pMMDest + 2, xmm2);
                        _mm_store_si128(pMMDest + 3, xmm3);
                        pMMDest += 4;
#endif

                        p_dst += sizeof(DHWORD);
                        p_src += sizeof(DHWORD);
                        count -= sizeof(DHWORD);
                    }
                }
                else
                {
#if defined(__GNUC__)
                    // Sync the WC memory data before issuing the MOVNTDQA instruction.
                    _mm_mfence();
#endif
                    for( size_t i=0; i<DoubleHexWordsToCopy; i++ )
                    {

#if !defined(_WIN64) && !defined(__GNUC__)
                        // 32-bit MSVC: emitted-opcode helper (unaligned stores)
                        FastBlockCopyFromUSWC_SSE4_1_movntdqa_movdqu( p_dst, p_src );
#else
                        xmm0 = _mm_stream_load_si128(pMMSrc);
                        xmm1 = _mm_stream_load_si128(pMMSrc + 1);
                        xmm2 = _mm_stream_load_si128(pMMSrc + 2);
                        xmm3 = _mm_stream_load_si128(pMMSrc + 3);
                        pMMSrc += 4;

                        _mm_storeu_si128(pMMDest, xmm0);
                        _mm_storeu_si128(pMMDest + 1, xmm1);
                        _mm_storeu_si128(pMMDest + 2, xmm2);
                        _mm_storeu_si128(pMMDest + 3, xmm3);
                        pMMDest += 4;
#endif

                        p_dst += sizeof(DHWORD);
                        p_src += sizeof(DHWORD);
                        count -= sizeof(DHWORD);
                    }
                }
            }
        }

        // Copy remaining BYTE(s)
        if( count )
        {
            MemCopy( p_dst, p_src, count );
        }
    }
    else
#endif //!defined ( _WIN64 ) || defined ( _In_ )
    {
        MemCopy( dst, src, bytes );
    }
}
1454
1455 /*****************************************************************************\
1456 Inline Function:
1457 FastCpuBlt
1458
1459 Description:
1460 Intel C++ Compiler CPU Blit function
1461
1462 Parameters:
1463 BYTE* dst - destination pointer
1464 const DWORD dstPitch - pitch to increment destination pointer per count
1465 BYTE* src - source pointer
1466 const DWORD srcPitch - pitch to increment source pointer per count
1467 const DWORD stride - stride of data to copy per count, in bytes
1468 DWORD count - number of iterations to copy data
1469
1470 \*****************************************************************************/
FastCpuBlt(BYTE * dst,const DWORD dstPitch,BYTE * src,const DWORD srcPitch,const DWORD stride,DWORD count)1471 inline void FastCpuBlt(
1472 BYTE* dst,
1473 const DWORD dstPitch,
1474 BYTE* src,
1475 const DWORD srcPitch,
1476 const DWORD stride,
1477 DWORD count )
1478 {
1479 do
1480 {
1481 MemCopy( dst, src, stride );
1482
1483 dst += dstPitch;
1484 src += srcPitch;
1485 }
1486 while( --count > 0 );
1487 }
1488
1489 /*****************************************************************************\
1490 Inline Function:
1491 FastCpuSet
1492
Description:
    Intel C++ Compiler CPU memory-set function; fills "count" rows of
    "stride" bytes each with the byte value "value"

Parameters:
    BYTE* dst - destination pointer
    const DWORD dstPitch - pitch to increment destination pointer per count
    const DWORD value - byte value written to the destination
    const DWORD stride - number of bytes to set per count, in bytes
    DWORD count - number of iterations to set data
1503
1504 \*****************************************************************************/
FastCpuSet(BYTE * dst,const DWORD dstPitch,const DWORD value,const DWORD stride,DWORD count)1505 inline void FastCpuSet(
1506 BYTE* dst,
1507 const DWORD dstPitch,
1508 const DWORD value,
1509 const DWORD stride,
1510 DWORD count )
1511 {
1512 do
1513 {
1514 SafeMemSet( dst, value, stride );
1515
1516 dst += dstPitch;
1517 }
1518 while( --count > 0 );
1519 }
1520
1521 /*****************************************************************************\
1522 Inline Function:
1523 FastCpuBltFromUSWC
1524
1525 Description:
1526 Intel C++ Compiler CPU Blit function from non-temporal to temporal memory
1527 This function is optimized using SSE4 instructions which use accelerated write-combined
1528 loads that bypass the cache.
1529
1530 Parameters:
1531 BYTE* dst - destination pointer (temporal)
1532 const DWORD dstPitch - pitch to increment destination pointer per count
1533 BYTE* src - source pointer (non-temporal)
1534 const DWORD srcPitch - pitch to increment source pointer per count
1535 const DWORD stride - stride of data to copy per count, in bytes
1536 DWORD count - number of iterations to copy data
1537 CPU_INSTRUCTION_LEVEL level - cpu instruction level (SSE support level)
1538
1539 \*****************************************************************************/
#if defined ( _MSC_VER )
// Row-by-row blit from USWC (write-combined) memory to cacheable memory;
// each row goes through FastMemCopyFromWC.  On 32-bit builds xmm0-xmm3 are
// saved/restored around the loop because the emitted-opcode block-copy
// helpers clobber them.
inline void FastCpuBltFromUSWC(
    BYTE* dst,
    const DWORD dstPitch,
    BYTE* src,
    const DWORD srcPitch,
    const DWORD stride,
    DWORD count,
    CPU_INSTRUCTION_LEVEL level)
{
    // Guard against count == 0: the unguarded do-while below would copy one
    // stray row and then "--count > 0" would underflow the unsigned counter,
    // looping ~2^32 more times.  Returning here is safe — the XMM backup has
    // not been taken yet, so there is nothing to restore.
    if( count == 0 )
    {
        return;
    }

#ifndef _WIN64

    //back up the XMM registers just in case
    __declspec( align(16) ) BYTE backUpRegisters[16*4];

    void *tempPtr = (void *) backUpRegisters;

    __asm mov ecx, tempPtr
    __asm movdqa xmmword ptr [ecx + 16*0], xmm0
    __asm movdqa xmmword ptr [ecx + 16*1], xmm1
    __asm movdqa xmmword ptr [ecx + 16*2], xmm2
    __asm movdqa xmmword ptr [ecx + 16*3], xmm3

#endif //_WIN64
    do
    {
        iSTD::FastMemCopyFromWC( dst, src, stride, level );

        dst += dstPitch;
        src += srcPitch;
    }
    while( --count > 0 );
#ifndef _WIN64
#if defined ( _MSC_VER )
    // Restore the saved XMM registers.
    __asm mov ecx, tempPtr
    __asm movdqa xmm0, xmmword ptr [ecx + 16*0]
    __asm movdqa xmm1, xmmword ptr [ecx + 16*1]
    __asm movdqa xmm2, xmmword ptr [ecx + 16*2]
    __asm movdqa xmm3, xmmword ptr [ecx + 16*3]
#endif
#endif //_WIN64
}
#endif
1583
1584
1585 /*****************************************************************************\
1586 Inline Function:
1587 FindWordBufferMinMax
1588
1589 Description:
1590 Finds the min and max unsigned 16-bit values in the buffer
1591
1592 Input:
1593 WORD* pBuffer - pointer to 16-bit buffer
1594 const DWORD bytes - size of buffer in bytes
1595
1596 Output:
1597 WORD &min - minimum 16-bit value
1598 WORD &max - maximum 16-bit value
1599
1600 \*****************************************************************************/
// Computes the smallest and largest unsigned 16-bit values in pBuffer
// ("bytes" bytes long) and returns them through min/max.  WORD-aligned
// buffers are processed 8 values per pass with SSE2; on 32-bit builds a
// non-WORD-aligned buffer takes an MMX path (4 values per pass); any
// remainder — and, on 64-bit builds, the whole unaligned case — is handled
// by the scalar loop at the end.  For bytes == 0 the outputs are the
// initial sentinels: min = 0xffff, max = 0x0000.
inline void FindWordBufferMinMax(
    WORD* pBuffer,
    const DWORD bytes,
    WORD &min,
    WORD &max )
{
    PrefetchBuffer( (BYTE*)pBuffer, bytes );

    WORD wValue = 0;
    WORD wMinValue = 0xffff;
    WORD wMaxValue = 0x0000;

    size_t count = bytes / sizeof(WORD);
    size_t i = 0;

    if( IsAligned( pBuffer, sizeof(WORD) ) )
    {
        const size_t DoubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD);
        const size_t WordsPerPrefetch = sizeof(PREFETCH) / sizeof(WORD);
        const size_t WordsPerDoubleQuadWord = sizeof(DQWORD) / sizeof(WORD);

        Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
        Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

        // Find min/max per cacheline of values
        if( count >= WordsPerDoubleQuadWord )
        {
            const size_t doubleQuadwordAlignWords =
                GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(WORD);

            // If pBuffer is not double-quadword aligned then process
            // until aligned
            if( doubleQuadwordAlignWords )
            {
                for( i = 0; i < doubleQuadwordAlignWords; i++ )
                {
                    wValue = *pBuffer++;

                    wMinValue = Min( wMinValue, wValue );
                    wMaxValue = Max( wMaxValue, wValue );
                }

                count -= doubleQuadwordAlignWords;
            }

            // Find min/max per cacheline of values
            if( count >= WordsPerDoubleQuadWord )
            {
                __m128i mValue128i;

                // Need to convert unsigned values to signed values
                // since min/max is signed op
                // (SSE2 has only signed 16-bit min/max; biasing every value
                // by 0x8000 maps the unsigned range onto the signed range
                // while preserving order)
                __m128i mSignedScale128i = _mm_set1_epi16((WORD)0x8000);

                // Signed min/max initialization
                __m128i mMinValue128i = _mm_set1_epi16(wMinValue-(WORD)0x8000);
                __m128i mMaxValue128i = _mm_set1_epi16(wMaxValue-(WORD)0x8000);

                while( count >= WordsPerPrefetch )
                {
                    Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

                    // Process cacheline values per pass
                    count -= WordsPerPrefetch;

                    for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
                    {
                        // Get double-quadword values
                        mValue128i = *(__m128i*)pBuffer;
                        pBuffer += WordsPerDoubleQuadWord;

                        // Make values signed
                        mValue128i = _mm_sub_epi16( mValue128i,
                            mSignedScale128i );

                        // Determine parallel min/max
                        mMinValue128i = _mm_min_epi16( mMinValue128i,
                            mValue128i );
                        mMaxValue128i = _mm_max_epi16( mMaxValue128i,
                            mValue128i );
                    }
                }

                // Process double-quadword values per pass for remainder
                while( count >= WordsPerDoubleQuadWord )
                {
                    // Process double-quadword values per pass
                    count -= WordsPerDoubleQuadWord;

                    // Get double-quadword values
                    mValue128i = *(__m128i*)pBuffer;
                    pBuffer += WordsPerDoubleQuadWord;

                    // Make values signed
                    mValue128i = _mm_sub_epi16( mValue128i,
                        mSignedScale128i );

                    // Determine parallel min/max
                    mMinValue128i = _mm_min_epi16( mMinValue128i,
                        mValue128i );
                    mMaxValue128i = _mm_max_epi16( mMaxValue128i,
                        mValue128i );
                }

                // Determine wMinValue

                // Make values unsigned
                mMinValue128i = _mm_add_epi16( mMinValue128i,
                    mSignedScale128i );

                // Extract each value in double-quadword to find minimum
                // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 0 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 1 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 2 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 3 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 4 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 5 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 6 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 7 );
                wMinValue = Min( wMinValue, wValue );

                // Determine wMaxValue

                // Make values unsigned
                mMaxValue128i = _mm_add_epi16( mMaxValue128i,
                    mSignedScale128i );

                // Extract each value in double-quadword to find maximum
                // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 0 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 1 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 2 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 3 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 4 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 5 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 6 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 7 );
                wMaxValue = Max( wMaxValue, wValue );

            } // if( count >= WordsPerDoubleQuadWord )
        } // if( count >= WordsPerDoubleQuadWord )
    }
#ifndef _WIN64
    else // if( IsAligned( pBuffer, sizeof(WORD) ) )
    {
        // 32-bit builds only: buffer is not even WORD-aligned, so use the
        // MMX path (4 words per pass) with unaligned 64-bit loads.
        const size_t QuadWordsPerCacheline = sizeof(CACHELINE) / sizeof(QWORD);
        const size_t WordsPerCacheline = sizeof(CACHELINE) / sizeof(WORD);
        const size_t WordsPerQuadWord = sizeof(QWORD) / sizeof(WORD);

        Prefetch( (BYTE*)pBuffer + sizeof(CACHELINE) );
        Prefetch( (BYTE*)pBuffer + 2 * sizeof(CACHELINE) );

        if( count >= WordsPerQuadWord )
        {
            __m64 mValue64;

            // Need to convert unsigned values to signed values
            // since min/max is signed op
            __m64 mSignedScale64 = _mm_set1_pi16((WORD)0x8000);

            // Signed min/max initialization
            __m64 mMinValue64 = _mm_set1_pi16(wMinValue-(WORD)0x8000);
            __m64 mMaxValue64 = _mm_set1_pi16(wMaxValue-(WORD)0x8000);

            // Find min/max per cacheline of values
            while( count >= WordsPerCacheline )
            {
                Prefetch( (BYTE*)pBuffer + sizeof(CACHELINE) );

                // Process cacheline values per pass
                count -= WordsPerCacheline;

                for( i = 0; i < QuadWordsPerCacheline; i++ )
                {
                    // Get quadword values
                    mValue64 = *(__m64*)pBuffer;
                    pBuffer += WordsPerQuadWord;

                    // Make values signed
                    mValue64 = _mm_sub_pi16( mValue64, mSignedScale64 );

                    // Determine parallel min/max
                    mMinValue64 = _mm_min_pi16( mMinValue64, mValue64 );
                    mMaxValue64 = _mm_max_pi16( mMaxValue64, mValue64 );
                }
            }

            // Process quadword values per pass for remainder
            while( count >= WordsPerQuadWord )
            {
                // Process quadword values per pass
                count -= WordsPerQuadWord;

                // Get quadword values
                mValue64 = *(__m64*)pBuffer;
                pBuffer += WordsPerQuadWord;

                // Make values signed
                mValue64 = _mm_sub_pi16( mValue64, mSignedScale64 );

                // Determine parallel min/max
                mMinValue64 = _mm_min_pi16( mMinValue64, mValue64 );
                mMaxValue64 = _mm_max_pi16( mMaxValue64, mValue64 );
            }

            // Determine wMinValue

            // Make values unsigned
            mMinValue64 = _mm_add_pi16( mMinValue64, mSignedScale64 );

            // Extract each value in quadword to find minimum
            // for( i = 0; i < WordsPerQuadWord; i++ )
            wValue = (WORD)_mm_extract_pi16( mMinValue64, 0 );
            wMinValue = Min( wMinValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMinValue64, 1 );
            wMinValue = Min( wMinValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMinValue64, 2 );
            wMinValue = Min( wMinValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMinValue64, 3 );
            wMinValue = Min( wMinValue, wValue );

            // Determine wMaxValue

            // Make values unsigned
            mMaxValue64 = _mm_add_pi16( mMaxValue64, mSignedScale64 );

            // Extract each value in quadword to find maximum
            // for( i = 0; i < WordsPerQuadWord; i++ )
            wValue = (WORD)_mm_extract_pi16( mMaxValue64, 0 );
            wMaxValue = Max( wMaxValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMaxValue64, 1 );
            wMaxValue = Max( wMaxValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMaxValue64, 2 );
            wMaxValue = Max( wMaxValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMaxValue64, 3 );
            wMaxValue = Max( wMaxValue, wValue );

            // EMMS: clear the MMX state before any following x87 FP code.
            _mm_empty();

        } // if( count >= WordsPerQuadWord )
    }
#endif

    // Find min/max per value
    while( count > 0 )
    {
        count -= 1;

        wValue = *pBuffer++;

        wMinValue = Min( wMinValue, wValue );
        wMaxValue = Max( wMaxValue, wValue );
    }

    min = wMinValue;
    max = wMaxValue;
}
1873
1874
1875 /*****************************************************************************\
1876 Inline Function:
1877 FindWordBufferMinMaxRestart
1878
1879 Description:
    Finds the min and max unsigned 16-bit values in the buffer
    Excludes a restart value from min or max values

Input:
    WORD* pBuffer - pointer to 16-bit buffer
    const DWORD bytes - size of buffer in bytes
    const WORD restart - restart index to ignore
    cpuInstructionLevel - indicates if SSE_4.1 is available

Output:
    WORD &min - minimum 16-bit value
    WORD &max - maximum 16-bit value
1892
1893 \*****************************************************************************/
FindWordBufferMinMaxRestart(WORD * pBuffer,const DWORD bytes,const WORD restart,WORD & min,WORD & max,CPU_INSTRUCTION_LEVEL cpuInstructionLevel)1894 inline void FindWordBufferMinMaxRestart(
1895 WORD* pBuffer,
1896 const DWORD bytes,
1897 const WORD restart,
1898 WORD &min,
1899 WORD &max,
1900 CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
1901 {
1902 // PrefetchBuffer( (BYTE*)pBuffer, bytes );
1903
1904 WORD wValue = 0;
1905 WORD wMinValue = 0xffff;
1906 WORD wMaxValue = 0x0000;
1907
1908 size_t count = bytes / sizeof(WORD);
1909
1910 #ifdef USE_SSE4_1
1911
1912 size_t i = 0;
1913
1914 if( IsAligned( pBuffer, sizeof(WORD) ) &&
1915 cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
1916 {
1917 const DWORD DoubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD);
1918 const DWORD WordsPerPrefetch = sizeof(PREFETCH) / sizeof(WORD);
1919 const DWORD WordsPerDoubleQuadWord = sizeof(DQWORD) / sizeof(WORD);
1920
1921 Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
1922 Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
1923
1924 // Find min/max per cacheline of values
1925 if( count >= WordsPerDoubleQuadWord )
1926 {
1927 const size_t doubleQuadwordAlignWords =
1928 GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(WORD);
1929
1930 // If pBuffer is not double-quadword aligned then process
1931 // until aligned
1932 if( doubleQuadwordAlignWords )
1933 {
1934 for( i = 0; i < doubleQuadwordAlignWords; i++ )
1935 {
1936 wValue = *pBuffer++;
1937
1938 if (wValue == restart) {
1939 continue;
1940 }
1941 wMinValue = Min( wMinValue, wValue );
1942 wMaxValue = Max( wMaxValue, wValue );
1943 }
1944
1945 count -= doubleQuadwordAlignWords;
1946 }
1947
1948 // Find min/max per cacheline of values
1949 if( count >= WordsPerDoubleQuadWord )
1950 {
1951 __m128i mInput, mRestarts, mMask;
1952 __m128i mAll_ones;
1953 __m128i mMinValue128i, mMaxValue128i;
1954
1955 // This is just used for andnot mInput
1956 mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
1957 mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
1958
1959 // start with really high min and really low max
1960 // What should happen if all values are restart?
1961 mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
1962 mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
1963 mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
1964 mMaxValue128i.m128i_u64[1] = 0x0000000000000000;
1965
1966 // Initialize register used for testing for restart index.
1967 mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] =
1968 (((UINT64) restart) << 48) |
1969 (((UINT64) restart) << 32) |
1970 (((UINT64) restart) << 16) |
1971 ((UINT64) restart);
1972
1973 while( count >= WordsPerPrefetch )
1974 {
1975 Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
1976
1977 // Process cacheline values per pass
1978 count -= WordsPerPrefetch;
1979
1980 for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
1981 {
1982 // Get double-quadword values
1983 mInput = *(__m128i*)pBuffer;
1984 pBuffer += WordsPerDoubleQuadWord;
1985
1986 // Make mask of non-restart_index fields
1987 mMask = _mm_andnot_si128(_mm_cmpeq_epi16(mInput, mRestarts), mAll_ones);
1988
1989 // Copy minimum and maximum fields for non-restarts
1990 mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu16(mMinValue128i, mInput), mMask );
1991 mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu16(mMaxValue128i, mInput), mMask );
1992 }
1993 }
1994
1995 // Process double-quadword values per pass for remainder
1996 while( count >= WordsPerDoubleQuadWord )
1997 {
1998 // Process double-quadword values per pass
1999 count -= WordsPerDoubleQuadWord;
2000
2001 // Get double-quadword values
2002 mInput = *(__m128i*)pBuffer;
2003 pBuffer += WordsPerDoubleQuadWord;
2004
2005 // Make mask of non-restart_index fields
2006 mMask = _mm_andnot_si128(_mm_cmpeq_epi16(mInput, mRestarts), mAll_ones);
2007
2008 // Copy minimum and maximum fields for non-restarts
2009 mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu16(mMinValue128i, mInput), mMask );
2010 mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu16(mMaxValue128i, mInput), mMask );
2011 }
2012
2013 // Determine wMinValue
2014
2015 // Extract each value in double-quadword to find minimum
2016 // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
2017 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 0 );
2018 wMinValue = Min( wMinValue, wValue );
2019 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 1 );
2020 wMinValue = Min( wMinValue, wValue );
2021 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 2 );
2022 wMinValue = Min( wMinValue, wValue );
2023 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 3 );
2024 wMinValue = Min( wMinValue, wValue );
2025 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 4 );
2026 wMinValue = Min( wMinValue, wValue );
2027 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 5 );
2028 wMinValue = Min( wMinValue, wValue );
2029 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 6 );
2030 wMinValue = Min( wMinValue, wValue );
2031 wValue = (WORD)_mm_extract_epi16( mMinValue128i, 7 );
2032 wMinValue = Min( wMinValue, wValue );
2033
2034 // Determine wMaxValue
2035
2036 // Extract each value in double-quadword to find maximum
2037 // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
2038 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 0 );
2039 wMaxValue = Max( wMaxValue, wValue );
2040 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 1 );
2041 wMaxValue = Max( wMaxValue, wValue );
2042 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 2 );
2043 wMaxValue = Max( wMaxValue, wValue );
2044 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 3 );
2045 wMaxValue = Max( wMaxValue, wValue );
2046 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 4 );
2047 wMaxValue = Max( wMaxValue, wValue );
2048 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 5 );
2049 wMaxValue = Max( wMaxValue, wValue );
2050 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 6 );
2051 wMaxValue = Max( wMaxValue, wValue );
2052 wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 7 );
2053 wMaxValue = Max( wMaxValue, wValue );
2054
2055 } // if( count >= WordsPerDoubleQuadWord )
2056 } // if( count >= WordsPerDoubleQuadWord )
2057 }
2058
2059 #endif // USE_SSE4_1
2060
2061 // Find min/max per value
2062 while( count > 0 )
2063 {
2064 count -= 1;
2065
2066 wValue = *pBuffer++;
2067
2068 if (wValue == restart) {
2069 continue;
2070 }
2071 wMinValue = Min( wMinValue, wValue );
2072 wMaxValue = Max( wMaxValue, wValue );
2073 }
2074
2075 min = wMinValue;
2076 max = wMaxValue;
2077 }
2078
2079
2080 /*****************************************************************************\
2081 Inline Function:
2082 FindDWordBufferMinMax
2083
2084 Description:
2085 Finds the min and max unsigned 32-bit values in the buffer
2086
2087 Input:
2088 DWORD* pBuffer - pointer to 32-bit buffer
2089 const DWORD bytes - size of buffer in bytes
2090
2091 Output:
2092 DWORD &min - minimum 32-bit value
2093 DWORD &max - maximum 32-bit value
2094
2095 \*****************************************************************************/
// Scans an aligned 32-bit buffer with SSE2 (4 lanes at a time) and falls
// back to a scalar loop for the unaligned case and the tail.
inline void FindDWordBufferMinMax(
    DWORD* pBuffer,
    const DWORD bytes,
    DWORD &min,
    DWORD &max )
{
    PrefetchBuffer( (BYTE*)pBuffer, bytes );

    DWORD wValue = 0;
    DWORD wMinValue = 0xffffffff;   // any real value can only lower this
    DWORD wMaxValue = 0x00000000;   // any real value can only raise this

    DWORD count = bytes / sizeof(DWORD);
    DWORD i = 0;

    if( IsAligned( pBuffer, sizeof(DWORD) ) )
    {
        const DWORD DoubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD);
        const DWORD DWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DWORD);
        const DWORD DWordsPerDoubleQuadWord = sizeof(DQWORD) / sizeof(DWORD);

        Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
        Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

        // Find min/max per cacheline of values
        if( count >= DWordsPerDoubleQuadWord )
        {
            const DWORD doubleQuadwordAlignWords =
                GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(DWORD);

            // If pBuffer is not double-quadword aligned then process
            // scalar values until it is, so the vector loop can use
            // aligned 128-bit loads.
            if( doubleQuadwordAlignWords )
            {
                for( i = 0; i < doubleQuadwordAlignWords; i++ )
                {
                    wValue = *pBuffer++;

                    wMinValue = Min( wMinValue, wValue );
                    wMaxValue = Max( wMaxValue, wValue );
                }

                count -= doubleQuadwordAlignWords;
            }

            // SSE2 path: min/max is done in float because SSE2 has no
            // unsigned 32-bit integer min/max.
            // NOTE(review): _mm_cvtepi32_ps is a *signed* conversion and
            // float has a 24-bit mantissa, so values >= 2^24 may round and
            // values >= 2^31 compare as negative; results are only exact
            // for inputs below 2^24 -- confirm callers stay in that range.
            if( count >= DWordsPerPrefetch )
            {
                __m128i mValue128i;
                __m128 mValue128;

                // Signed min/max initialization.
                // The QWORD cast makes 0xffffffff convert as 2^32-1
                // (a large positive float) instead of signed -1.
                __m128 mMinValue128 = _mm_set1_ps( (float)( (QWORD)wMinValue ) );
                __m128 mMaxValue128 = _mm_set1_ps( (float)( wMaxValue ) );

                // Main loop: one PREFETCH-sized chunk per pass.
                while( count >= DWordsPerPrefetch )
                {
                    Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

                    // Process cacheline values per pass
                    count -= DWordsPerPrefetch;

                    for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
                    {
                        // Get double-quadword values
                        mValue128i = *(__m128i*)pBuffer;
                        pBuffer += DWordsPerDoubleQuadWord;

                        // Convert to FP
                        mValue128 = _mm_cvtepi32_ps( mValue128i );

                        // Determine parallel min/max
                        mMinValue128 = _mm_min_ps( mMinValue128,
                            mValue128 );
                        mMaxValue128 = _mm_max_ps( mMaxValue128,
                            mValue128 );
                    }
                }

                // Process double-quadword values per pass for remainder
                while( count >= DWordsPerDoubleQuadWord )
                {
                    // Process double-quadword values per pass
                    count -= DWordsPerDoubleQuadWord;

                    // Get double-quadword values
                    mValue128i = *(__m128i*)pBuffer;
                    pBuffer += DWordsPerDoubleQuadWord;

                    // Convert to FP
                    mValue128 = _mm_cvtepi32_ps( mValue128i );

                    // Determine parallel min/max
                    mMinValue128 = _mm_min_ps( mMinValue128,
                        mValue128 );
                    mMaxValue128 = _mm_max_ps( mMaxValue128,
                        mValue128 );
                }

                // Determine wMinValue

                // Convert back to DWORD
                __m128i mMinValue128i = _mm_cvtps_epi32( mMinValue128 );

                // Extract each of the 4 lanes to find the minimum;
                // lane k is shifted down by 4*k bytes before extraction.
                // Grab element 0 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32( mMinValue128i );
                wMinValue = Min( wMinValue, wValue );
                // Grab element 1 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMinValue128i, 4 ) );
                wMinValue = Min( wMinValue, wValue );
                // Grab element 2 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMinValue128i, 8 ) );
                wMinValue = Min( wMinValue, wValue );
                // Grab element 3 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMinValue128i, 12 ) );
                wMinValue = Min( wMinValue, wValue );

                // Determine wMaxValue

                // Convert back to DWORD
                __m128i mMaxValue128i = _mm_cvtps_epi32( mMaxValue128 );

                // Extract each of the 4 lanes to find the maximum.
                // Grab element 0 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32( mMaxValue128i );
                wMaxValue = Max( wMaxValue, wValue );
                // Grab element 1 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMaxValue128i, 4 ) );
                wMaxValue = Max( wMaxValue, wValue );
                // Grab element 2 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMaxValue128i, 8 ) );
                wMaxValue = Max( wMaxValue, wValue );
                // Grab element 3 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMaxValue128i, 12 ) );
                wMaxValue = Max( wMaxValue, wValue );

            } // if( count >= DWordsPerDoubleQuadWord )
        } // if( count >= DWordsPerDoubleQuadWord )
    }

    // Scalar tail; also handles the whole buffer when pBuffer is unaligned.
    while( count > 0 )
    {
        count -= 1;

        wValue = *pBuffer++;

        wMinValue = Min( wMinValue, wValue );
        wMaxValue = Max( wMaxValue, wValue );
    }

    min = wMinValue;
    max = wMaxValue;
}
2258
2259
2260 /*****************************************************************************\
2261 Inline Function:
2262 FindDWordBufferMinMaxRestart
2263
2264 Description:
2265 Finds the min and max unsigned 32-bit values in the buffer
2266 Excludes a restart value from min or max values
2267
2268 Input:
2269 DWORD* pBuffer - pointer to 32-bit buffer
2270 const DWORD bytes - size of buffer in bytes
2271 const DWORD restart - restart index to ignore
2272 cpuInstructionLevel - indicates if SSE_4.1 is available
2273
2274 Output:
2275 DWORD &min - minimum 32-bit value
2276 DWORD &max - maximum 32-bit value
2277
2278 \*****************************************************************************/
FindDWordBufferMinMaxRestart(DWORD * pBuffer,const DWORD bytes,const DWORD restart,DWORD & min,DWORD & max,CPU_INSTRUCTION_LEVEL cpuInstructionLevel)2279 inline void FindDWordBufferMinMaxRestart(
2280 DWORD* pBuffer,
2281 const DWORD bytes,
2282 const DWORD restart,
2283 DWORD &min,
2284 DWORD &max,
2285 CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
2286 {
2287 // PrefetchBuffer( (BYTE*)pBuffer, bytes );
2288
2289 DWORD wValue = 0;
2290 DWORD wMinValue = 0xffffffff;
2291 DWORD wMaxValue = 0x00000000;
2292
2293 DWORD count = bytes / sizeof(DWORD);
2294
2295 #ifdef USE_SSE4_1
2296
2297 DWORD i = 0;
2298
2299 if( IsAligned( pBuffer, sizeof(DWORD) ) &&
2300 cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
2301 {
2302 const DWORD DoubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD);
2303 const DWORD DWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DWORD);
2304 const DWORD DWordsPerDoubleQuadWord = sizeof(DQWORD) / sizeof(DWORD);
2305
2306 Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
2307 Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2308
2309 // Find min/max per cacheline of values
2310 if( count >= DWordsPerDoubleQuadWord )
2311 {
2312 const DWORD doubleQuadwordAlignWords =
2313 GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(DWORD);
2314
2315 // If pBuffer is not double-quadword aligned then process
2316 // until aligned
2317 if( doubleQuadwordAlignWords )
2318 {
2319 for( i = 0; i < doubleQuadwordAlignWords; i++ )
2320 {
2321 wValue = *pBuffer++;
2322
2323 if (wValue == restart) {
2324 continue;
2325 }
2326 wMinValue = Min( wMinValue, wValue );
2327 wMaxValue = Max( wMaxValue, wValue );
2328 }
2329
2330 count -= doubleQuadwordAlignWords;
2331 }
2332
2333 // Find min/max per cacheline of values
2334 if( count >= DWordsPerPrefetch )
2335 {
2336 __m128i mInput, mRestarts, mMask;
2337 __m128i mAll_ones;
2338 __m128i mMinValue128i, mMaxValue128i;
2339
2340 // This is just used for andnot mInput
2341 mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
2342 mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
2343
2344 // start with really high min and really low max
2345 // What should happen if all values are restart?
2346 mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
2347 mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
2348 mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
2349 mMaxValue128i.m128i_u64[1] = 0x0000000000000000;
2350
2351 // Initialize register used for testing for restart index.
2352 mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] = (((UINT64) restart) << 32) | ((UINT64) restart);
2353
2354 while( count >= DWordsPerPrefetch )
2355 {
2356 Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );
2357
2358 // Process cacheline values per pass
2359 count -= DWordsPerPrefetch;
2360
2361 for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
2362 {
2363 // Get double-quadword values
2364 mInput = *(__m128i*)pBuffer;
2365 pBuffer += DWordsPerDoubleQuadWord;
2366 // Make mask of non-restart_index fields
2367 mMask = _mm_andnot_si128(_mm_cmpeq_epi32(mInput, mRestarts), mAll_ones);
2368
2369 // Copy minimum and maximum fields for non-restarts
2370 mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu32(mMinValue128i, mInput), mMask );
2371 mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu32(mMaxValue128i, mInput), mMask );
2372 }
2373 }
2374
2375 // Process double-quadword values per pass for remainder
2376 while( count >= DWordsPerDoubleQuadWord )
2377 {
2378 // Process double-quadword values per pass
2379 count -= DWordsPerDoubleQuadWord;
2380
2381 // Get double-quadword values
2382 mInput = *(__m128i*)pBuffer;
2383 pBuffer += DWordsPerDoubleQuadWord;
2384
2385 // Make mask of non-restart_index fields
2386 mMask = _mm_andnot_si128(_mm_cmpeq_epi32(mInput, mRestarts), mAll_ones);
2387
2388 // Copy minimum and maximum fields for non-restarts
2389 mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu32(mMinValue128i, mInput), mMask );
2390 mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu32(mMaxValue128i, mInput), mMask );
2391 }
2392
2393 // Determine wMinValue
2394
2395 // Extract each value in double-quadword to find minimum
2396 // Grab element 0 from m128i reg: 3 | 2 | 1 | 0
2397 wValue = (DWORD)_mm_cvtsi128_si32( mMinValue128i );
2398 wMinValue = Min( wMinValue, wValue );
2399 // Grab element 1 from m128i reg: 3 | 2 | 1 | 0
2400 wValue = (DWORD)_mm_cvtsi128_si32(
2401 _mm_srli_si128( mMinValue128i, 4 ) );
2402 wMinValue = Min( wMinValue, wValue );
2403 // Grab element 2 from m128i reg: 3 | 2 | 1 | 0
2404 wValue = (DWORD)_mm_cvtsi128_si32(
2405 _mm_srli_si128( mMinValue128i, 8 ) );
2406 wMinValue = Min( wMinValue, wValue );
2407 // Grab element 2 from m128i reg: 3 | 2 | 1 | 0
2408 wValue = (DWORD)_mm_cvtsi128_si32(
2409 _mm_srli_si128( mMinValue128i, 12 ) );
2410 wMinValue = Min( wMinValue, wValue );
2411 // Determine wMaxValue
2412 // Extract each value in double-quadword to find maximum
2413 // Grab element 0 from m128i reg: 3 | 2 | 1 | 0
2414 wValue = (DWORD)_mm_cvtsi128_si32( mMaxValue128i );
2415 wMaxValue = Max( wMaxValue, wValue );
2416 // Grab element 1 from m128i reg: 3 | 2 | 1 | 0
2417 wValue = (DWORD)_mm_cvtsi128_si32(
2418 _mm_srli_si128( mMaxValue128i, 4 ) );
2419 wMaxValue = Max( wMaxValue, wValue );
2420 // Grab element 2 from m128i reg: 3 | 2 | 1 | 0
2421 wValue = (DWORD)_mm_cvtsi128_si32(
2422 _mm_srli_si128( mMaxValue128i, 8 ) );
2423 wMaxValue = Max( wMaxValue, wValue );
2424 // Grab element 3 from m128i reg: 3 | 2 | 1 | 0
2425 wValue = (DWORD)_mm_cvtsi128_si32(
2426 _mm_srli_si128( mMaxValue128i, 12 ) );
2427 wMaxValue = Max( wMaxValue, wValue );
2428
2429 } // if( count >= DWordsPerPrefetch )
2430 } // if( count >= DWordsPerDoubleQuadWord )
2431 }
2432
2433 #endif // USE_SSE4_1
2434
2435 // Find min/max per value
2436 while( count > 0 )
2437 {
2438 count -= 1;
2439
2440 wValue = *pBuffer++;
2441
2442 if (wValue == restart) {
2443 continue;
2444 }
2445 wMinValue = Min( wMinValue, wValue );
2446 wMaxValue = Max( wMaxValue, wValue );
2447 }
2448
2449 min = wMinValue;
2450 max = wMaxValue;
2451 }
2452
2453
2454
2455 /*****************************************************************************\
2456 Inline Function:
2457 FindWordBufferMinMaxCopy
2458
2459 Description:
2460 Finds the min and max unsigned 16-bit values in the buffer
2461 Copies data from pBuffer to pDest at the same time
2462
2463 Input:
2464 WORD* pDest - pointer to 16-bit buffer to copy into
2465 WORD* pBuffer - pointer to 16-bit index buffer
2466 const DWORD bytes - size of buffer in bytes
2467
2468 Output:
2469 WORD &min - minimum 16-bit value
2470 WORD &max - maximum 16-bit value
2471
2472 \*****************************************************************************/
// Copies the 16-bit buffer to pDest while tracking min/max in one pass.
// Three paths: SSE2 (aligned source), MMX (32-bit builds, unaligned
// source), and a scalar loop for tails and everything else.
inline void FindWordBufferMinMaxCopy(
    WORD* pDest,
    WORD* pBuffer,
    const DWORD bytes,
    WORD &min,
    WORD &max )
{
    // PrefetchBuffer( (BYTE*)pBuffer, bytes );

    WORD wValue = 0;
    WORD wMinValue = 0xffff;    // any real value can only lower this
    WORD wMaxValue = 0x0000;    // any real value can only raise this

    size_t count = bytes / sizeof(WORD);
    size_t i = 0;

    if( IsAligned( pBuffer, sizeof(WORD) ) )
    {
        const size_t DoubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD);
        const size_t WordsPerPrefetch = sizeof(PREFETCH) / sizeof(WORD);
        const size_t WordsPerDoubleQuadWord = sizeof(DQWORD) / sizeof(WORD);

        Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
        Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

        // Find min/max per cacheline of values
        if( count >= WordsPerDoubleQuadWord )
        {
            const size_t doubleQuadwordAlignWords =
                GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(WORD);

            // If pBuffer is not double-quadword aligned then copy/process
            // scalar values until it is (only the source is aligned here;
            // pDest may remain unaligned, hence the storeu below).
            if( doubleQuadwordAlignWords )
            {
                for( i = 0; i < doubleQuadwordAlignWords; i++ )
                {
                    wValue = *pDest++ = *pBuffer++;

                    wMinValue = Min( wMinValue, wValue );
                    wMaxValue = Max( wMaxValue, wValue );
                }

                count -= doubleQuadwordAlignWords;
            }

            // SSE2 path, 8 words per vector.
            if( count >= WordsPerDoubleQuadWord )
            {
                __m128i mValue128i;

                // SSE2 only has *signed* 16-bit min/max
                // (_mm_min_epi16/_mm_max_epi16), so bias every value by
                // -0x8000 first; that maps unsigned order onto signed
                // order, and the bias is undone after the loops.
                __m128i mSignedScale128i = _mm_set1_epi16((WORD)0x8000);

                // Signed min/max initialization (biased like the data).
                __m128i mMinValue128i = _mm_set1_epi16(wMinValue-(WORD)0x8000);
                __m128i mMaxValue128i = _mm_set1_epi16(wMaxValue-(WORD)0x8000);

                // Main loop: one PREFETCH-sized chunk per pass.
                while( count >= WordsPerPrefetch )
                {
                    Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

                    // Process cacheline values per pass
                    count -= WordsPerPrefetch;

                    for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
                    {
                        // Get double-quadword values and copy them out
                        // (unaligned store -- pDest alignment is unknown)
                        mValue128i = *(__m128i*)pBuffer;
                        _mm_storeu_si128((__m128i*)pDest, mValue128i);
                        pBuffer += WordsPerDoubleQuadWord;
                        pDest += WordsPerDoubleQuadWord;

                        // Make values signed
                        mValue128i = _mm_sub_epi16( mValue128i,
                            mSignedScale128i );

                        // Determine parallel min/max
                        mMinValue128i = _mm_min_epi16( mMinValue128i,
                            mValue128i );
                        mMaxValue128i = _mm_max_epi16( mMaxValue128i,
                            mValue128i );
                    }
                }

                // Process double-quadword values per pass for remainder
                while( count >= WordsPerDoubleQuadWord )
                {
                    // Process double-quadword values per pass
                    count -= WordsPerDoubleQuadWord;

                    // Get double-quadword values and copy them out
                    mValue128i = *(__m128i*)pBuffer;
                    _mm_storeu_si128((__m128i*)pDest, mValue128i);
                    pBuffer += WordsPerDoubleQuadWord;
                    pDest += WordsPerDoubleQuadWord;

                    // Make values signed
                    mValue128i = _mm_sub_epi16( mValue128i,
                        mSignedScale128i );

                    // Determine parallel min/max
                    mMinValue128i = _mm_min_epi16( mMinValue128i,
                        mValue128i );
                    mMaxValue128i = _mm_max_epi16( mMaxValue128i,
                        mValue128i );
                }

                // Determine wMinValue

                // Undo the signed bias
                mMinValue128i = _mm_add_epi16( mMinValue128i,
                    mSignedScale128i );

                // Reduce the 8 lanes into the scalar running minimum
                // (_mm_extract_epi16 needs an immediate, hence unrolled)
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 0 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 1 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 2 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 3 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 4 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 5 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 6 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 7 );
                wMinValue = Min( wMinValue, wValue );

                // Determine wMaxValue

                // Undo the signed bias
                mMaxValue128i = _mm_add_epi16( mMaxValue128i,
                    mSignedScale128i );

                // Reduce the 8 lanes into the scalar running maximum
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 0 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 1 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 2 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 3 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 4 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 5 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 6 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 7 );
                wMaxValue = Max( wMaxValue, wValue );

            } // if( count >= WordsPerDoubleQuadWord )
        } // if( count >= WordsPerDoubleQuadWord )
    }
#ifndef _WIN64
    // MMX fallback for 32-bit builds when the source is not WORD-aligned;
    // __m64 is unavailable on x64, hence the #ifndef.
    else // if( IsAligned( pBuffer, sizeof(WORD) ) )
    {
        const size_t QuadWordsPerCacheline = sizeof(CACHELINE) / sizeof(QWORD);
        const size_t WordsPerCacheline = sizeof(CACHELINE) / sizeof(WORD);
        const size_t WordsPerQuadWord = sizeof(QWORD) / sizeof(WORD);

        Prefetch( (BYTE*)pBuffer + sizeof(CACHELINE) );
        Prefetch( (BYTE*)pBuffer + 2 * sizeof(CACHELINE) );

        if( count >= WordsPerQuadWord )
        {
            __m64 mValue64;

            // Same signed-bias trick as the SSE2 path above, since
            // _mm_min_pi16/_mm_max_pi16 are signed ops.
            __m64 mSignedScale64 = _mm_set1_pi16((WORD)0x8000);

            // Signed min/max initialization
            __m64 mMinValue64 = _mm_set1_pi16(wMinValue-(WORD)0x8000);
            __m64 mMaxValue64 = _mm_set1_pi16(wMaxValue-(WORD)0x8000);

            // Find min/max per cacheline of values
            while( count >= WordsPerCacheline )
            {
                Prefetch( (BYTE*)pBuffer + sizeof(CACHELINE) );

                // Process cacheline values per pass
                count -= WordsPerCacheline;

                for( i = 0; i < QuadWordsPerCacheline; i++ )
                {
                    // Get quadword values and copy them out
                    // NOTE(review): both loads/stores here deref
                    // potentially misaligned __m64* -- relies on x86
                    // tolerating unaligned 64-bit accesses.
                    mValue64 = *(__m64*)pBuffer;
                    *(__m64*)pDest = mValue64;
                    pBuffer += WordsPerQuadWord;
                    pDest += WordsPerQuadWord;

                    // Make values signed
                    mValue64 = _mm_sub_pi16( mValue64, mSignedScale64 );

                    // Determine parallel min/max
                    mMinValue64 = _mm_min_pi16( mMinValue64, mValue64 );
                    mMaxValue64 = _mm_max_pi16( mMaxValue64, mValue64 );
                }
            }

            // Process quadword values per pass for remainder
            while( count >= WordsPerQuadWord )
            {
                // Process quadword values per pass
                count -= WordsPerQuadWord;

                // Get quadword values and copy them out
                mValue64 = *(__m64*)pBuffer;
                *(__m64*)pDest = mValue64;
                pBuffer += WordsPerQuadWord;
                pDest += WordsPerQuadWord;

                // Make values signed
                mValue64 = _mm_sub_pi16( mValue64, mSignedScale64 );

                // Determine parallel min/max
                mMinValue64 = _mm_min_pi16( mMinValue64, mValue64 );
                mMaxValue64 = _mm_max_pi16( mMaxValue64, mValue64 );
            }

            // Determine wMinValue

            // Undo the signed bias
            mMinValue64 = _mm_add_pi16( mMinValue64, mSignedScale64 );

            // Reduce the 4 lanes into the scalar running minimum
            wValue = (WORD)_mm_extract_pi16( mMinValue64, 0 );
            wMinValue = Min( wMinValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMinValue64, 1 );
            wMinValue = Min( wMinValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMinValue64, 2 );
            wMinValue = Min( wMinValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMinValue64, 3 );
            wMinValue = Min( wMinValue, wValue );

            // Determine wMaxValue

            // Undo the signed bias
            mMaxValue64 = _mm_add_pi16( mMaxValue64, mSignedScale64 );

            // Reduce the 4 lanes into the scalar running maximum
            wValue = (WORD)_mm_extract_pi16( mMaxValue64, 0 );
            wMaxValue = Max( wMaxValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMaxValue64, 1 );
            wMaxValue = Max( wMaxValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMaxValue64, 2 );
            wMaxValue = Max( wMaxValue, wValue );
            wValue = (WORD)_mm_extract_pi16( mMaxValue64, 3 );
            wMaxValue = Max( wMaxValue, wValue );

            // Leave MMX state (EMMS) before any FP code runs
            _mm_empty();

        } // if( count >= WordsPerQuadWord )
    }
#endif

    // Scalar tail: copy and fold min/max per value
    while( count > 0 )
    {
        count -= 1;

        wValue = *pDest++ = *pBuffer++;

        wMinValue = Min( wMinValue, wValue );
        wMaxValue = Max( wMaxValue, wValue );
    }

    min = wMinValue;
    max = wMaxValue;
}
2754
2755 /*****************************************************************************\
2756 Inline Function:
2757 FindDWordBufferMinMaxCopy
2758
2759 Description:
2760 Finds the min and max unsigned 32-bit values in the buffer
2761 Copies data from pBuffer to pDest at the same time
2762
2763 Input:
2764 DWORD* pDest - pointer to 32-bit buffer to copy into
2765 DWORD* pBuffer - pointer to 32-bit buffer
2766 const DWORD bytes - size of buffer in bytes
2767
2768 Output:
    DWORD &min - minimum 32-bit value
    DWORD &max - maximum 32-bit value
2771
2772 \*****************************************************************************/
// Copies the 32-bit buffer to pDest while tracking min/max in one pass.
// Uses the same float-based SSE2 reduction as FindDWordBufferMinMax.
inline void FindDWordBufferMinMaxCopy(
    DWORD* pDest,
    DWORD* pBuffer,
    const DWORD bytes,
    DWORD &min,
    DWORD &max )
{
    // PrefetchBuffer( (BYTE*)pBuffer, bytes );

    DWORD wValue = 0;
    DWORD wMinValue = 0xffffffff;   // any real value can only lower this
    DWORD wMaxValue = 0x00000000;   // any real value can only raise this

    DWORD count = bytes / sizeof(DWORD);
    DWORD i = 0;

    if( IsAligned( pBuffer, sizeof(DWORD) ) )
    {
        const DWORD DoubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD);
        const DWORD DWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DWORD);
        const DWORD DWordsPerDoubleQuadWord = sizeof(DQWORD) / sizeof(DWORD);

        Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
        Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

        // Find min/max per cacheline of values
        if( count >= DWordsPerDoubleQuadWord )
        {
            const DWORD doubleQuadwordAlignWords =
                GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(DWORD);

            // If pBuffer is not double-quadword aligned then copy/process
            // scalar values until it is (only the source is aligned;
            // pDest may remain unaligned, hence the storeu below).
            if( doubleQuadwordAlignWords )
            {
                for( i = 0; i < doubleQuadwordAlignWords; i++ )
                {
                    wValue = *pDest++ = *pBuffer++;

                    wMinValue = Min( wMinValue, wValue );
                    wMaxValue = Max( wMaxValue, wValue );
                }

                count -= doubleQuadwordAlignWords;
            }

            // SSE2 path: min/max done in float since SSE2 has no unsigned
            // 32-bit integer min/max.
            // NOTE(review): _mm_cvtepi32_ps is a *signed* conversion and
            // float has a 24-bit mantissa, so values >= 2^24 may round and
            // values >= 2^31 compare as negative; results are only exact
            // for inputs below 2^24 -- confirm callers stay in that range.
            if( count >= DWordsPerDoubleQuadWord )
            {
                __m128i mValue128i;
                __m128 mValue128;

                // Signed min/max initialization.
                // The QWORD cast makes 0xffffffff convert as 2^32-1
                // (a large positive float) instead of signed -1.
                __m128 mMinValue128 = _mm_set1_ps( (float)( (QWORD)wMinValue ) );
                __m128 mMaxValue128 = _mm_set1_ps( (float)( wMaxValue ) );

                // Main loop: one PREFETCH-sized chunk per pass.
                while( count >= DWordsPerPrefetch )
                {
                    Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

                    // Process cacheline values per pass
                    count -= DWordsPerPrefetch;

                    for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
                    {
                        // Get double-quadword values and copy them out
                        // (unaligned store -- pDest alignment is unknown)
                        mValue128i = *(__m128i*)pBuffer;
                        _mm_storeu_si128((__m128i*)pDest, mValue128i);
                        pBuffer += DWordsPerDoubleQuadWord;
                        pDest += DWordsPerDoubleQuadWord;

                        // Convert to FP
                        mValue128 = _mm_cvtepi32_ps( mValue128i );

                        // Determine parallel min/max
                        mMinValue128 = _mm_min_ps( mMinValue128,
                            mValue128 );
                        mMaxValue128 = _mm_max_ps( mMaxValue128,
                            mValue128 );
                    }
                }

                // Process double-quadword values per pass for remainder
                while( count >= DWordsPerDoubleQuadWord )
                {
                    // Process double-quadword values per pass
                    count -= DWordsPerDoubleQuadWord;

                    // Get double-quadword values and copy them out
                    mValue128i = *(__m128i*)pBuffer;
                    _mm_storeu_si128((__m128i*)pDest, mValue128i);
                    pBuffer += DWordsPerDoubleQuadWord;
                    pDest += DWordsPerDoubleQuadWord;

                    // Convert to FP
                    mValue128 = _mm_cvtepi32_ps( mValue128i );

                    // Determine parallel min/max
                    mMinValue128 = _mm_min_ps( mMinValue128,
                        mValue128 );
                    mMaxValue128 = _mm_max_ps( mMaxValue128,
                        mValue128 );
                }

                // Determine wMinValue

                // Convert back to DWORD
                __m128i mMinValue128i = _mm_cvtps_epi32( mMinValue128 );

                // Extract each of the 4 lanes to find the minimum;
                // lane k is shifted down by 4*k bytes before extraction.
                // Grab element 0 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32( mMinValue128i );
                wMinValue = Min( wMinValue, wValue );
                // Grab element 1 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMinValue128i, 4 ) );
                wMinValue = Min( wMinValue, wValue );
                // Grab element 2 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMinValue128i, 8 ) );
                wMinValue = Min( wMinValue, wValue );
                // Grab element 3 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMinValue128i, 12 ) );
                wMinValue = Min( wMinValue, wValue );

                // Determine wMaxValue

                // Convert back to DWORD
                __m128i mMaxValue128i = _mm_cvtps_epi32( mMaxValue128 );

                // Extract each of the 4 lanes to find the maximum.
                // Grab element 0 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32( mMaxValue128i );
                wMaxValue = Max( wMaxValue, wValue );
                // Grab element 1 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMaxValue128i, 4 ) );
                wMaxValue = Max( wMaxValue, wValue );
                // Grab element 2 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMaxValue128i, 8 ) );
                wMaxValue = Max( wMaxValue, wValue );
                // Grab element 3 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMaxValue128i, 12 ) );
                wMaxValue = Max( wMaxValue, wValue );

            } // if( count >= DWordsPerDoubleQuadWord )
        } // if( count >= DWordsPerDoubleQuadWord )
    }

    // Scalar tail: copy and fold min/max per value
    while( count > 0 )
    {
        count -= 1;

        wValue = *pDest++ = *pBuffer++;

        wMinValue = Min( wMinValue, wValue );
        wMaxValue = Max( wMaxValue, wValue );
    }

    min = wMinValue;
    max = wMaxValue;
}
2940
2941
2942 /*****************************************************************************\
2943 Inline Function:
2944 FindWordBufferMinMaxRestartCoy
2945
2946 Description:
2947 Finds the min and max unsigned 32-bit values in the buffer
2948 Excludes a restart value from min or max values
2949 Copies data from pBuffer to pDest at the same time
2950
2951 Input:
2952 WORD* pDest - pointer to 32-bit buffer to copy into
2953 WORD* pBuffer - pointer to 32-bit buffer
2954 const DWORD bytes - size of buffer in bytes
2955 const WORD restart - restart index to ignore
2956 cpuInstructionLevel - indicates if SSE_4.1 is available
2957
2958 Output:
2959 WORD &min - minimum 32-bit value
2960 WORD &max - maximum 32-bit value
2961
2962 \*****************************************************************************/
inline void FindWordBufferMinMaxRestartCopy(
    WORD* pDest,
    WORD* pBuffer,
    const DWORD bytes,
    const WORD restart,
    WORD &min,
    WORD &max,
    CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
{
    // PrefetchBuffer( (BYTE*)pBuffer, bytes );

    // Running min/max accumulators. If every element equals "restart"
    // they keep their initial values, so callers may observe min > max.
    WORD wValue = 0;
    WORD wMinValue = 0xffff;
    WORD wMaxValue = 0x0000;

    // Number of 16-bit elements to copy and scan.
    size_t count = bytes / sizeof(WORD);

#ifdef USE_SSE4_1

    size_t i = 0;

    // SSE4.1 is required for _mm_blendv_epi8 / _mm_min_epu16 / _mm_max_epu16.
    if( IsAligned( pBuffer, sizeof(WORD) ) &&
        cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
    {
        const DWORD DoubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD);
        const DWORD WordsPerPrefetch = sizeof(PREFETCH) / sizeof(WORD);
        const DWORD WordsPerDoubleQuadWord = sizeof(DQWORD) / sizeof(WORD);

        Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
        Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

        // Find min/max per cacheline of values
        if( count >= WordsPerDoubleQuadWord )
        {
            const size_t doubleQuadwordAlignWords =
                GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(WORD);

            // If pBuffer is not double-quadword aligned then process
            // until aligned
            if( doubleQuadwordAlignWords )
            {
                for( i = 0; i < doubleQuadwordAlignWords; i++ )
                {
                    // Copy the element even when it is a restart index;
                    // restarts are only excluded from min/max tracking.
                    wValue = *pDest++ = *pBuffer++;

                    if (wValue == restart) {
                        continue;
                    }
                    wMinValue = Min( wMinValue, wValue );
                    wMaxValue = Max( wMaxValue, wValue );
                }

                count -= doubleQuadwordAlignWords;
            }

            // Find min/max per cacheline of values
            if( count >= WordsPerDoubleQuadWord )
            {
                __m128i mInput, mRestarts, mMask;
                __m128i mAll_ones;
                __m128i mMinValue128i, mMaxValue128i;

                // NOTE: .m128i_u64 element access is an MSVC extension;
                // USE_SSE4_1 is only defined for the _WIN32 build above.
                // This is just used for andnot mInput
                mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
                mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;

                // start with really high min and really low max
                // What should happen if all values are restart?
                mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
                mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
                mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
                mMaxValue128i.m128i_u64[1] = 0x0000000000000000;

                // Initialize register used for testing for restart index.
                // Broadcasts the 16-bit restart value into all 8 lanes.
                mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] =
                    (((UINT64) restart) << 48) |
                    (((UINT64) restart) << 32) |
                    (((UINT64) restart) << 16) |
                    ((UINT64) restart);

                while( count >= WordsPerPrefetch )
                {
                    Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

                    // Process cacheline values per pass
                    count -= WordsPerPrefetch;

                    for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
                    {
                        // Get double-quadword values
                        // (aligned load — pBuffer was aligned above;
                        // pDest may be unaligned, hence storeu)
                        mInput = *(__m128i*)pBuffer;
                        _mm_storeu_si128((__m128i*)pDest, mInput);
                        pBuffer += WordsPerDoubleQuadWord;
                        pDest += WordsPerDoubleQuadWord;

                        // Make mask of non-restart_index fields
                        mMask = _mm_andnot_si128(_mm_cmpeq_epi16(mInput, mRestarts), mAll_ones);

                        // Copy minimum and maximum fields for non-restarts
                        // (restart lanes keep the previous min/max values)
                        mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu16(mMinValue128i, mInput), mMask );
                        mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu16(mMaxValue128i, mInput), mMask );
                    }
                }

                // Process double-quadword values per pass for remainder
                while( count >= WordsPerDoubleQuadWord )
                {
                    // Process double-quadword values per pass
                    count -= WordsPerDoubleQuadWord;

                    // Get double-quadword values
                    mInput = *(__m128i*)pBuffer;
                    _mm_storeu_si128((__m128i*)pDest, mInput);
                    pBuffer += WordsPerDoubleQuadWord;
                    pDest += WordsPerDoubleQuadWord;

                    // Make mask of non-restart_index fields
                    mMask = _mm_andnot_si128(_mm_cmpeq_epi16(mInput, mRestarts), mAll_ones);

                    // Copy minimum and maximum fields for non-restarts
                    mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu16(mMinValue128i, mInput), mMask );
                    mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu16(mMaxValue128i, mInput), mMask );
                }

                // Determine wMinValue

                // Horizontal reduction of the 8 packed lanes into wMinValue.
                // Extract each value in double-quadword to find minimum
                // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 0 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 1 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 2 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 3 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 4 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 5 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 6 );
                wMinValue = Min( wMinValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMinValue128i, 7 );
                wMinValue = Min( wMinValue, wValue );

                // Determine wMaxValue

                // Horizontal reduction of the 8 packed lanes into wMaxValue.
                // Extract each value in double-quadword to find maximum
                // for( i = 0; i < WordsPerDoubleQuadWord; i++ )
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 0 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 1 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 2 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 3 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 4 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 5 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 6 );
                wMaxValue = Max( wMaxValue, wValue );
                wValue = (WORD)_mm_extract_epi16( mMaxValue128i, 7 );
                wMaxValue = Max( wMaxValue, wValue );

            } // if( count >= WordsPerDoubleQuadWord )
        } // if( count >= WordsPerDoubleQuadWord )
    }

#endif // USE_SSE4_1

    // Scalar tail: handles any remainder (and the whole buffer when the
    // SIMD path is unavailable), still copying while tracking min/max.
    // Find min/max per value
    while( count > 0 )
    {
        count -= 1;

        wValue = *pDest++ = *pBuffer++;

        if (wValue == restart) {
            continue;
        }
        wMinValue = Min( wMinValue, wValue );
        wMaxValue = Max( wMaxValue, wValue );
    }

    min = wMinValue;
    max = wMaxValue;
}
3152
3153
3154 /*****************************************************************************\
3155 Inline Function:
3156 FindDWordBufferMinMaxRestartCopy
3157
3158 Description:
3159 Finds the min and max unsigned 32-bit values in the buffer
3160 Excludes a restart value from min or max values
3161 Copies data from pBuffer to pDest at the same time
3162
3163 Input:
3164 DWORD* pDest - pointer to 32-bit buffer to copy into
3165 DWORD* pBuffer - pointer to 32-bit index buffer
3166 const DWORD bytes - size of buffer in bytes
3167 const DWORD restart - restart index to ignore
3168 cpuInstructionLevel - indicates if SSE_4.1 is available
3169
3170 Output:
3171 DWORD &min - minimum 32-bit value
3172 DWORD &max - maximum 32-bit value
3173
3174 \*****************************************************************************/
inline void FindDWordBufferMinMaxRestartCopy(
    DWORD* pDest,
    DWORD* pBuffer,
    const DWORD bytes,
    const DWORD restart,
    DWORD &min,
    DWORD &max,
    CPU_INSTRUCTION_LEVEL cpuInstructionLevel )
{
    // PrefetchBuffer( (BYTE*)pBuffer, bytes );

    // Running min/max accumulators. If every element equals "restart"
    // they keep their initial values, so callers may observe min > max.
    DWORD wValue = 0;
    DWORD wMinValue = 0xffffffff;
    DWORD wMaxValue = 0x00000000;

    // Number of 32-bit elements to copy and scan.
    DWORD count = bytes / sizeof(DWORD);

#ifdef USE_SSE4_1

    DWORD i = 0;

    // SSE4.1 is required for _mm_blendv_epi8 / _mm_min_epu32 / _mm_max_epu32.
    if( IsAligned( pBuffer, sizeof(DWORD) ) &&
        cpuInstructionLevel >= CPU_INSTRUCTION_LEVEL_SSE4_1 )
    {
        const DWORD DoubleQuadWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DQWORD);
        const DWORD DWordsPerPrefetch = sizeof(PREFETCH) / sizeof(DWORD);
        const DWORD DWordsPerDoubleQuadWord = sizeof(DQWORD) / sizeof(DWORD);

        Prefetch( (BYTE*)pBuffer + sizeof(PREFETCH) );
        Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

        // Find min/max per cacheline of values
        if( count >= DWordsPerDoubleQuadWord )
        {
            const DWORD doubleQuadwordAlignWords =
                GetAlignmentOffset( pBuffer, sizeof(DQWORD) ) / sizeof(DWORD);

            // If pBuffer is not double-quadword aligned then process
            // until aligned
            if( doubleQuadwordAlignWords )
            {
                for( i = 0; i < doubleQuadwordAlignWords; i++ )
                {
                    // Copy the element even when it is a restart index;
                    // restarts are only excluded from min/max tracking.
                    wValue = *pDest++ = *pBuffer++;

                    if (wValue == restart) {
                        continue;
                    }
                    wMinValue = Min( wMinValue, wValue );
                    wMaxValue = Max( wMaxValue, wValue );
                }

                count -= doubleQuadwordAlignWords;
            }

            // Find min/max per cacheline of values
            // NOTE: unlike the WORD variant (which enters its SIMD block at
            // one double-quadword), this guard requires a full prefetch's
            // worth of data; smaller remainders fall to the scalar tail.
            if( count >= DWordsPerPrefetch )
            {
                __m128i mInput, mRestarts, mMask;
                __m128i mAll_ones;
                __m128i mMinValue128i, mMaxValue128i;

                // NOTE: .m128i_u64 element access is an MSVC extension;
                // USE_SSE4_1 is only defined for the _WIN32 build above.
                // This is just used for andnot mInput
                mAll_ones.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
                mAll_ones.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;

                // start with really high min and really low max
                // What should happen if all values are restart?
                mMinValue128i.m128i_u64[0] = 0xFFFFFFFFFFFFFFFF;
                mMinValue128i.m128i_u64[1] = 0xFFFFFFFFFFFFFFFF;
                mMaxValue128i.m128i_u64[0] = 0x0000000000000000;
                mMaxValue128i.m128i_u64[1] = 0x0000000000000000;

                // Initialize register used for testing for restart index.
                // Broadcasts the 32-bit restart value into all 4 lanes.
                mRestarts.m128i_u64[0] = mRestarts.m128i_u64[1] = (((UINT64) restart) << 32) | ((UINT64) restart);

                while( count >= DWordsPerPrefetch )
                {
                    Prefetch( (BYTE*)pBuffer + 2 * sizeof(PREFETCH) );

                    // Process cacheline values per pass
                    count -= DWordsPerPrefetch;

                    for( i = 0; i < DoubleQuadWordsPerPrefetch; i++ )
                    {
                        // Get double-quadword values
                        // (aligned load — pBuffer was aligned above;
                        // pDest may be unaligned, hence storeu)
                        mInput = *(__m128i*)pBuffer;
                        _mm_storeu_si128((__m128i*)pDest, mInput);
                        pBuffer += DWordsPerDoubleQuadWord;
                        pDest += DWordsPerDoubleQuadWord;

                        // Make mask of non-restart_index fields
                        mMask = _mm_andnot_si128(_mm_cmpeq_epi32(mInput, mRestarts), mAll_ones);

                        // Copy minimum and maximum fields for non-restarts
                        // (restart lanes keep the previous min/max values)
                        mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu32(mMinValue128i, mInput), mMask );
                        mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu32(mMaxValue128i, mInput), mMask );
                    }
                }

                // Process double-quadword values per pass for remainder
                while( count >= DWordsPerDoubleQuadWord )
                {
                    // Process double-quadword values per pass
                    count -= DWordsPerDoubleQuadWord;

                    // Get double-quadword values
                    mInput = *(__m128i*)pBuffer;
                    _mm_storeu_si128((__m128i*)pDest, mInput);
                    pBuffer += DWordsPerDoubleQuadWord;
                    pDest += DWordsPerDoubleQuadWord;

                    // Make mask of non-restart_index fields
                    mMask = _mm_andnot_si128(_mm_cmpeq_epi32(mInput, mRestarts), mAll_ones);

                    // Copy minimum and maximum fields for non-restarts
                    mMinValue128i = _mm_blendv_epi8(mMinValue128i, _mm_min_epu32(mMinValue128i, mInput), mMask );
                    mMaxValue128i = _mm_blendv_epi8(mMaxValue128i, _mm_max_epu32(mMaxValue128i, mInput), mMask );
                }

                // Determine wMinValue

                // Horizontal reduction of the 4 packed lanes into wMinValue.
                // Extract each value in double-quadword to find minimum
                // Grab element 0 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32( mMinValue128i );
                wMinValue = Min( wMinValue, wValue );
                // Grab element 1 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMinValue128i, 4 ) );
                wMinValue = Min( wMinValue, wValue );
                // Grab element 2 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMinValue128i, 8 ) );
                wMinValue = Min( wMinValue, wValue );
                // Grab element 3 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMinValue128i, 12 ) );
                wMinValue = Min( wMinValue, wValue );

                // Determine wMaxValue

                // Horizontal reduction of the 4 packed lanes into wMaxValue.
                // Extract each value in double-quadword to find maximum
                // Grab element 0 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32( mMaxValue128i );
                wMaxValue = Max( wMaxValue, wValue );
                // Grab element 1 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMaxValue128i, 4 ) );
                wMaxValue = Max( wMaxValue, wValue );
                // Grab element 2 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMaxValue128i, 8 ) );
                wMaxValue = Max( wMaxValue, wValue );
                // Grab element 3 from m128i reg: 3 | 2 | 1 | 0
                wValue = (DWORD)_mm_cvtsi128_si32(
                    _mm_srli_si128( mMaxValue128i, 12 ) );
                wMaxValue = Max( wMaxValue, wValue );

            } // if( count >= DWordsPerPrefetch )
        } // if( count >= DWordsPerDoubleQuadWord )
    }

#endif // USE_SSE4_1

    // Scalar tail: handles any remainder (and the whole buffer when the
    // SIMD path is unavailable), still copying while tracking min/max.
    // Find min/max per value
    while( count > 0 )
    {
        count -= 1;

        wValue = *pDest++ = *pBuffer++;

        if (wValue == restart) {
            continue;
        }
        wMinValue = Min( wMinValue, wValue );
        wMaxValue = Max( wMaxValue, wValue );
    }

    min = wMinValue;
    max = wMaxValue;
}
3356
3357
3358 } // iSTD
3359