1/************************************************************************
2*                                                                       *
3* xnamathconvert.inl -- SIMD C++ Math library for Windows and Xbox 360  *
4*                       Conversion, loading, and storing functions      *
5*                                                                       *
6* Copyright (c) Microsoft Corp. All rights reserved.                    *
7*                                                                       *
8************************************************************************/
9
10#if defined(_MSC_VER) && (_MSC_VER > 1000)
11#pragma once
12#endif
13
14#ifndef __XNAMATHCONVERT_INL__
15#define __XNAMATHCONVERT_INL__
16
17#define XM_PACK_FACTOR                  (FLOAT)(1 << 22)
18#define XM_UNPACK_FACTOR_UNSIGNED       (FLOAT)(1 << 23)
19#define XM_UNPACK_FACTOR_SIGNED         XM_PACK_FACTOR
20
21#define XM_UNPACK_UNSIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
22                                        {-XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
23                                         -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
24                                         -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
25                                         -XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}
26
27#define XM_UNPACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
28                                        {XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsX)) - 1), \
29                                         XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsY)) - 1), \
30                                         XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsZ)) - 1), \
31                                         XM_UNPACK_FACTOR_UNSIGNED / (FLOAT)((1 << (BitsW)) - 1)}
32
33#define XM_UNPACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
34                                        {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1), \
35                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1), \
36                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1), \
37                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1)}
38
39//#define XM_UNPACK_SIGNEDN_OFFSET(BitsX, BitsY, BitsZ, BitsW) \
40//                                        {-XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsX) - 1)) - 1) * 3.0f, \
41//                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsY) - 1)) - 1) * 3.0f, \
42//                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsZ) - 1)) - 1) * 3.0f, \
43//                                         -XM_UNPACK_FACTOR_SIGNED / (FLOAT)((1 << ((BitsW) - 1)) - 1) * 3.0f}
44
45#define XM_PACK_UNSIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
46                                        {-(FLOAT)((1 << (BitsX)) - 1) / XM_PACK_FACTOR, \
47                                         -(FLOAT)((1 << (BitsY)) - 1) / XM_PACK_FACTOR, \
48                                         -(FLOAT)((1 << (BitsZ)) - 1) / XM_PACK_FACTOR, \
49                                         -(FLOAT)((1 << (BitsW)) - 1) / XM_PACK_FACTOR}
50
51#define XM_PACK_SIGNEDN_SCALE(BitsX, BitsY, BitsZ, BitsW) \
52                                        {-(FLOAT)((1 << ((BitsX) - 1)) - 1) / XM_PACK_FACTOR, \
53                                         -(FLOAT)((1 << ((BitsY) - 1)) - 1) / XM_PACK_FACTOR, \
54                                         -(FLOAT)((1 << ((BitsZ) - 1)) - 1) / XM_PACK_FACTOR, \
55                                         -(FLOAT)((1 << ((BitsW) - 1)) - 1) / XM_PACK_FACTOR}
56
57#define XM_PACK_OFFSET                  XMVectorSplatConstant(3, 0)
58//#define XM_UNPACK_OFFSET                XM_PACK_OFFSET
59
60/****************************************************************************
61 *
62 * Data conversion
63 *
64 ****************************************************************************/
65
66//------------------------------------------------------------------------------
67
68XMFINLINE FLOAT XMConvertHalfToFloat
69(
70    HALF Value
71)
72{
73#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
74
75    UINT Mantissa;
76    UINT Exponent;
77    UINT Result;
78
79    Mantissa = (UINT)(Value & 0x03FF);
80
81    if ((Value & 0x7C00) != 0)  // The value is normalized
82    {
83        Exponent = (UINT)((Value >> 10) & 0x1F);
84    }
85    else if (Mantissa != 0)     // The value is denormalized
86    {
87        // Normalize the value in the resulting float
88        Exponent = 1;
89
90        do
91        {
92            Exponent--;
93            Mantissa <<= 1;
94        } while ((Mantissa & 0x0400) == 0);
95
96        Mantissa &= 0x03FF;
97    }
98    else                        // The value is zero
99    {
100        Exponent = (UINT)-112;
101    }
102
103    Result = ((Value & 0x8000) << 16) | // Sign
104             ((Exponent + 112) << 23) | // Exponent
105             (Mantissa << 13);          // Mantissa
106
107    return *(FLOAT*)&Result;
108
109#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
110#endif
111}
112
113//------------------------------------------------------------------------------
114
115XMINLINE FLOAT* XMConvertHalfToFloatStream
116(
117    FLOAT*      pOutputStream,
118    size_t      OutputStride,
119    CONST HALF* pInputStream,
120    size_t      InputStride,
121    size_t      HalfCount
122)
123{
124#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
125
126    size_t  i;
127    CONST BYTE* pHalf = (CONST BYTE*)pInputStream;
128    BYTE* pFloat = (BYTE*)pOutputStream;
129
130    XMASSERT(pOutputStream);
131    XMASSERT(pInputStream);
132
133    for (i = 0; i < HalfCount; i++)
134    {
135        *(FLOAT*)pFloat = XMConvertHalfToFloat(*(const HALF*)pHalf);
136        pHalf += InputStride;
137        pFloat += OutputStride;
138    }
139
140    return pOutputStream;
141
142#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
143#endif // _XM_VMX128_INTRINSICS_
144}
145
146//------------------------------------------------------------------------------
147
148XMFINLINE HALF XMConvertFloatToHalf
149(
150    FLOAT Value
151)
152{
153#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
154    UINT Result;
155
156    UINT IValue = ((UINT *)(&Value))[0];
157    UINT Sign = (IValue & 0x80000000U) >> 16U;
158    IValue = IValue & 0x7FFFFFFFU;      // Hack off the sign
159
160    if (IValue > 0x47FFEFFFU)
161    {
162        // The number is too large to be represented as a half.  Saturate to infinity.
163        Result = 0x7FFFU;
164    }
165    else
166    {
167        if (IValue < 0x38800000U)
168        {
169            // The number is too small to be represented as a normalized half.
170            // Convert it to a denormalized value.
171            UINT Shift = 113U - (IValue >> 23U);
172            IValue = (0x800000U | (IValue & 0x7FFFFFU)) >> Shift;
173        }
174        else
175        {
176            // Rebias the exponent to represent the value as a normalized half.
177            IValue += 0xC8000000U;
178        }
179
180        Result = ((IValue + 0x0FFFU + ((IValue >> 13U) & 1U)) >> 13U)&0x7FFFU;
181    }
182    return (HALF)(Result|Sign);
183
184#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
185#endif
186}
187
188//------------------------------------------------------------------------------
189
190XMINLINE HALF* XMConvertFloatToHalfStream
191(
192    HALF*        pOutputStream,
193    size_t       OutputStride,
194    CONST FLOAT* pInputStream,
195    size_t       InputStride,
196    size_t       FloatCount
197)
198{
199#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
200
201    size_t  i;
202    BYTE* pFloat = (BYTE*)pInputStream;
203    BYTE* pHalf = (BYTE*)pOutputStream;
204
205    XMASSERT(pOutputStream);
206    XMASSERT(pInputStream);
207
208    for (i = 0; i < FloatCount; i++)
209    {
210        *(HALF*)pHalf = XMConvertFloatToHalf(*(FLOAT*)pFloat);
211        pFloat += InputStride;
212        pHalf += OutputStride;
213    }
214    return pOutputStream;
215#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
216#endif // _XM_VMX128_INTRINSICS_
217}
218
219//------------------------------------------------------------------------------
220
221#if defined(_XM_NO_INTRINSICS_) || defined(_XM_SSE_INTRINSICS_)
222// For VMX128, these routines are all defines in the main header
223
224#pragma warning(push)
225#pragma warning(disable:4701) // Prevent warnings about 'Result' potentially being used without having been initialized
226
227XMINLINE XMVECTOR XMConvertVectorIntToFloat
228(
229    FXMVECTOR VInt,
230    UINT     DivExponent
231)
232{
233#if defined(_XM_NO_INTRINSICS_)
234    UINT ElementIndex;
235    FLOAT fScale;
236    XMVECTOR Result;
237    XMASSERT(DivExponent<32);
238    fScale = 1.0f / (FLOAT)(1U << DivExponent);
239    ElementIndex = 0;
240    do {
241        INT iTemp = (INT)VInt.vector4_u32[ElementIndex];
242        Result.vector4_f32[ElementIndex] = ((FLOAT)iTemp) * fScale;
243    } while (++ElementIndex<4);
244    return Result;
245#else // _XM_SSE_INTRINSICS_
246    XMASSERT(DivExponent<32);
247    // Convert to floats
248    XMVECTOR vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&VInt)[0]);
249    // Convert DivExponent into 1.0f/(1<<DivExponent)
250    UINT uScale = 0x3F800000U - (DivExponent << 23);
251    // Splat the scalar value
252    __m128i vScale = _mm_set1_epi32(uScale);
253    vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&vScale)[0]);
254    return vResult;
255#endif
256}
257
258//------------------------------------------------------------------------------
259
260XMINLINE XMVECTOR XMConvertVectorFloatToInt
261(
262    FXMVECTOR VFloat,
263    UINT     MulExponent
264)
265{
266#if defined(_XM_NO_INTRINSICS_)
267    UINT ElementIndex;
268    XMVECTOR Result;
269    FLOAT fScale;
270    XMASSERT(MulExponent<32);
271    // Get the scalar factor.
272    fScale = (FLOAT)(1U << MulExponent);
273    ElementIndex = 0;
274    do {
275        INT iResult;
276        FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
277        if (fTemp <= -(65536.0f*32768.0f)) {
278            iResult = (-0x7FFFFFFF)-1;
279        } else if (fTemp > (65536.0f*32768.0f)-128.0f) {
280            iResult = 0x7FFFFFFF;
281        } else {
282            iResult = (INT)fTemp;
283        }
284        Result.vector4_u32[ElementIndex] = (UINT)iResult;
285    } while (++ElementIndex<4);
286    return Result;
287#else // _XM_SSE_INTRINSICS_
288    XMASSERT(MulExponent<32);
289    XMVECTOR vResult = _mm_set_ps1((FLOAT)(1U << MulExponent));
290    vResult = _mm_mul_ps(vResult,VFloat);
291    // In case of positive overflow, detect it
292    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxInt);
293    // Float to int conversion
294    __m128i vResulti = _mm_cvttps_epi32(vResult);
295    // If there was positive overflow, set to 0x7FFFFFFF
296    vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
297    vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
298    vOverflow = _mm_or_ps(vOverflow,vResult);
299    return vOverflow;
300#endif
301}
302
303//------------------------------------------------------------------------------
304
305XMINLINE XMVECTOR XMConvertVectorUIntToFloat
306(
307    FXMVECTOR VUInt,
308    UINT      DivExponent
309)
310{
311#if defined(_XM_NO_INTRINSICS_)
312    UINT ElementIndex;
313    FLOAT fScale;
314    XMVECTOR Result;
315    XMASSERT(DivExponent<32);
316    fScale = 1.0f / (FLOAT)(1U << DivExponent);
317    ElementIndex = 0;
318    do {
319        Result.vector4_f32[ElementIndex] = (FLOAT)VUInt.vector4_u32[ElementIndex] * fScale;
320    } while (++ElementIndex<4);
321    return Result;
322#else // _XM_SSE_INTRINSICS_
323    XMASSERT(DivExponent<32);
324    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
325    // Determine which ones need the fix.
326    XMVECTOR vMask = _mm_and_ps(VUInt,g_XMNegativeZero);
327    // Force all values positive
328    XMVECTOR vResult = _mm_xor_ps(VUInt,vMask);
329    // Convert to floats
330    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
331    // Convert 0x80000000 -> 0xFFFFFFFF
332    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
333    // For only the ones that are too big, add the fixup
334    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
335    vResult = _mm_add_ps(vResult,vMask);
336    // Convert DivExponent into 1.0f/(1<<DivExponent)
337    UINT uScale = 0x3F800000U - (DivExponent << 23);
338    // Splat
339    iMask = _mm_set1_epi32(uScale);
340    vResult = _mm_mul_ps(vResult,reinterpret_cast<const __m128 *>(&iMask)[0]);
341    return vResult;
342#endif
343}
344
345//------------------------------------------------------------------------------
346
347XMINLINE XMVECTOR XMConvertVectorFloatToUInt
348(
349    FXMVECTOR VFloat,
350    UINT      MulExponent
351)
352{
353#if defined(_XM_NO_INTRINSICS_)
354    UINT ElementIndex;
355    XMVECTOR Result;
356    FLOAT fScale;
357    XMASSERT(MulExponent<32);
358    // Get the scalar factor.
359    fScale = (FLOAT)(1U << MulExponent);
360    ElementIndex = 0;
361    do {
362        UINT uResult;
363        FLOAT fTemp = VFloat.vector4_f32[ElementIndex]*fScale;
364        if (fTemp <= 0.0f) {
365            uResult = 0;
366        } else if (fTemp >= (65536.0f*65536.0f)) {
367            uResult = 0xFFFFFFFFU;
368        } else {
369            uResult = (UINT)fTemp;
370        }
371        Result.vector4_u32[ElementIndex] = uResult;
372    } while (++ElementIndex<4);
373    return Result;
374#else // _XM_SSE_INTRINSICS_
375    XMASSERT(MulExponent<32);
376    XMVECTOR vResult = _mm_set_ps1(static_cast<float>(1U << MulExponent));
377    vResult = _mm_mul_ps(vResult,VFloat);
378    // Clamp to >=0
379    vResult = _mm_max_ps(vResult,g_XMZero);
380    // Any numbers that are too big, set to 0xFFFFFFFFU
381    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
382    XMVECTOR vValue = g_XMUnsignedFix;
383    // Too large for a signed integer?
384    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
385    // Zero for number's lower than 0x80000000, 32768.0f*65536.0f otherwise
386    vValue = _mm_and_ps(vValue,vMask);
387    // Perform fixup only on numbers too large (Keeps low bit precision)
388    vResult = _mm_sub_ps(vResult,vValue);
389    __m128i vResulti = _mm_cvttps_epi32(vResult);
390    // Convert from signed to unsigned pnly if greater than 0x80000000
391    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
392    vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
393    // On those that are too large, set to 0xFFFFFFFF
394    vResult = _mm_or_ps(vResult,vOverflow);
395    return vResult;
396#endif
397}
398
399#pragma warning(pop)
400
401#endif // _XM_NO_INTRINSICS_ || _XM_SSE_INTRINSICS_
402
403/****************************************************************************
404 *
405 * Vector and matrix load operations
406 *
407 ****************************************************************************/
408
409//------------------------------------------------------------------------------
410
411XMFINLINE XMVECTOR XMLoadInt(CONST UINT* pSource)
412{
413#if defined(_XM_NO_INTRINSICS_)
414
415    XMVECTOR V;
416    XMASSERT(pSource);
417    XMASSERT(((UINT_PTR)pSource & 3) == 0);
418
419    V.vector4_u32[0] = *pSource;
420
421    return V;
422
423#elif defined(_XM_SSE_INTRINSICS_)
424    XMASSERT(pSource);
425    XMASSERT(((UINT_PTR)pSource & 3) == 0);
426
427    return _mm_load_ss( (const float*)pSource );
428#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
429#endif // _XM_VMX128_INTRINSICS_
430}
431
432//------------------------------------------------------------------------------
433
434XMFINLINE XMVECTOR XMLoadFloat(CONST FLOAT* pSource)
435{
436#if defined(_XM_NO_INTRINSICS_)
437
438    XMVECTOR V;
439    XMASSERT(pSource);
440    XMASSERT(((UINT_PTR)pSource & 3) == 0);
441
442    V.vector4_f32[0] = *pSource;
443
444    return V;
445
446#elif defined(_XM_SSE_INTRINSICS_)
447    XMASSERT(pSource);
448    XMASSERT(((UINT_PTR)pSource & 3) == 0);
449
450    return _mm_load_ss( pSource );
451#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
452#endif // _XM_VMX128_INTRINSICS_
453}
454
455//------------------------------------------------------------------------------
456
457XMFINLINE XMVECTOR XMLoadInt2
458(
459    CONST UINT* pSource
460)
461{
462#if defined(_XM_NO_INTRINSICS_)
463
464    XMVECTOR V;
465
466    XMASSERT(pSource);
467
468    V.vector4_u32[0] = pSource[0];
469    V.vector4_u32[1] = pSource[1];
470
471    return V;
472#elif defined(_XM_SSE_INTRINSICS_)
473
474    XMASSERT(pSource);
475
476    __m128 x = _mm_load_ss( (const float*)pSource );
477    __m128 y = _mm_load_ss( (const float*)(pSource+1) );
478    return _mm_unpacklo_ps( x, y );
479#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
480#endif // _XM_VMX128_INTRINSICS_
481}
482
483//------------------------------------------------------------------------------
484
485XMFINLINE XMVECTOR XMLoadSInt2
486(
487    CONST XMINT2* pSource
488)
489{
490#if defined(_XM_NO_INTRINSICS_)
491    XMVECTOR V;
492    XMASSERT(pSource);
493
494    V.vector4_f32[0] = (float)pSource->x;
495    V.vector4_f32[1] = (float)pSource->y;
496    return V;
497
498#elif defined(_XM_SSE_INTRINSICS_)
499    XMASSERT(pSource);
500
501    __m128 x = _mm_load_ss( (const float*)&pSource->x );
502    __m128 y = _mm_load_ss( (const float*)&pSource->y );
503    __m128 V = _mm_unpacklo_ps( x, y );
504    return _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&V)[0]);
505#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
506#endif // _XM_VMX128_INTRINSICS_
507}
508
509//------------------------------------------------------------------------------
510
511XMFINLINE XMVECTOR XMLoadUInt2
512(
513    CONST XMUINT2* pSource
514)
515{
516#if defined(_XM_NO_INTRINSICS_)
517    XMVECTOR V;
518    XMASSERT(pSource);
519
520    V.vector4_f32[0] = (float)pSource->x;
521    V.vector4_f32[1] = (float)pSource->y;
522    return V;
523
524#elif defined(_XM_SSE_INTRINSICS_)
525    XMASSERT(pSource);
526
527    __m128 x = _mm_load_ss( (const float*)&pSource->x );
528    __m128 y = _mm_load_ss( (const float*)&pSource->y );
529    __m128 V = _mm_unpacklo_ps( x, y );
530    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
531    // Determine which ones need the fix.
532    XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
533    // Force all values positive
534    XMVECTOR vResult = _mm_xor_ps(V,vMask);
535    // Convert to floats
536    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
537    // Convert 0x80000000 -> 0xFFFFFFFF
538    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
539    // For only the ones that are too big, add the fixup
540    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
541    vResult = _mm_add_ps(vResult,vMask);
542    return vResult;
543#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
544#endif // _XM_VMX128_INTRINSICS_
545}
546
547//------------------------------------------------------------------------------
548
549XMFINLINE XMVECTOR XMLoadInt2A
550(
551    CONST UINT* pSource
552)
553{
554#if defined(_XM_NO_INTRINSICS_)
555
556    XMVECTOR V;
557
558    XMASSERT(pSource);
559    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
560
561    V.vector4_u32[0] = pSource[0];
562    V.vector4_u32[1] = pSource[1];
563
564    return V;
565
566#elif defined(_XM_SSE_INTRINSICS_)
567
568    XMASSERT(pSource);
569    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
570
571    __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
572    return reinterpret_cast<__m128 *>(&V)[0];
573
574#else // _XM_VMX128_INTRINSICS_
575#endif // _XM_VMX128_INTRINSICS_
576}
577
578//------------------------------------------------------------------------------
579
580XMFINLINE XMVECTOR XMLoadFloat2
581(
582    CONST XMFLOAT2* pSource
583)
584{
585#if defined(_XM_NO_INTRINSICS_)
586    XMVECTOR V;
587    XMASSERT(pSource);
588
589    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
590    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
591    return V;
592#elif defined(_XM_SSE_INTRINSICS_)
593    XMASSERT(pSource);
594
595    __m128 x = _mm_load_ss( &pSource->x );
596    __m128 y = _mm_load_ss( &pSource->y );
597    return _mm_unpacklo_ps( x, y );
598#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
599#endif // _XM_VMX128_INTRINSICS_
600}
601
602//------------------------------------------------------------------------------
603
604XMFINLINE XMVECTOR XMLoadFloat2A
605(
606    CONST XMFLOAT2A* pSource
607)
608{
609#if defined(_XM_NO_INTRINSICS_)
610
611    XMVECTOR V;
612
613    XMASSERT(pSource);
614    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
615
616    V.vector4_f32[0] = pSource->x;
617    V.vector4_f32[1] = pSource->y;
618
619    return V;
620
621#elif defined(_XM_SSE_INTRINSICS_)
622    XMASSERT(pSource);
623    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
624
625    __m128i V = _mm_loadl_epi64( (const __m128i*)pSource );
626    return reinterpret_cast<__m128 *>(&V)[0];
627#else // _XM_VMX128_INTRINSICS_
628#endif // _XM_VMX128_INTRINSICS_
629}
630
631//------------------------------------------------------------------------------
632
633XMFINLINE XMVECTOR XMLoadHalf2
634(
635    CONST XMHALF2* pSource
636)
637{
638#if defined(_XM_NO_INTRINSICS_)
639    XMASSERT(pSource);
640    {
641    XMVECTOR vResult = {
642        XMConvertHalfToFloat(pSource->x),
643        XMConvertHalfToFloat(pSource->y),
644        0.0f,
645        0.0f
646    };
647    return vResult;
648    }
649#elif defined(_XM_SSE_INTRINSICS_)
650    XMASSERT(pSource);
651    XMVECTOR vResult = {
652        XMConvertHalfToFloat(pSource->x),
653        XMConvertHalfToFloat(pSource->y),
654        0.0f,
655        0.0f
656    };
657    return vResult;
658
659#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
660#endif // _XM_VMX128_INTRINSICS_
661}
662
663//------------------------------------------------------------------------------
664
665XMFINLINE XMVECTOR XMLoadShortN2
666(
667    CONST XMSHORTN2* pSource
668)
669{
670#if defined(_XM_NO_INTRINSICS_)
671    XMASSERT(pSource);
672    {
673    XMVECTOR vResult = {
674        (pSource->x == -32768) ? -1.f : ((FLOAT)pSource->x * (1.0f/32767.0f)),
675        (pSource->y == -32768) ? -1.f : ((FLOAT)pSource->y * (1.0f/32767.0f)),
676        0.0f,
677        0.0f
678    };
679    return vResult;
680    }
681
682#elif defined(_XM_SSE_INTRINSICS_)
683    XMASSERT(pSource);
684    // Splat the two shorts in all four entries (WORD alignment okay,
685    // DWORD alignment preferred)
686    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
687    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
688    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
689    // x needs to be sign extended
690    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
691    // Convert to floating point numbers
692    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
693    // x - 0x8000 to undo the signed order.
694    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
695    // Convert -1.0f - 1.0f
696    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16);
697    // Clamp result (for case of -32768)
698    return _mm_max_ps( vTemp, g_XMNegativeOne );
699#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
700#endif // _XM_VMX128_INTRINSICS_
701}
702
703//------------------------------------------------------------------------------
704
705XMFINLINE XMVECTOR XMLoadShort2
706(
707    CONST XMSHORT2* pSource
708)
709{
710#if defined(_XM_NO_INTRINSICS_)
711
712    XMVECTOR V;
713
714    XMASSERT(pSource);
715
716    V.vector4_f32[0] = (FLOAT)pSource->x;
717    V.vector4_f32[1] = (FLOAT)pSource->y;
718
719    return V;
720
721#elif defined(_XM_SSE_INTRINSICS_)
722    XMASSERT(pSource);
723    // Splat the two shorts in all four entries (WORD alignment okay,
724    // DWORD alignment preferred)
725    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
726    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
727    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
728    // x needs to be sign extended
729    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16);
730    // Convert to floating point numbers
731    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
732    // x - 0x8000 to undo the signed order.
733    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16);
734    // Y is 65536 too large
735    return _mm_mul_ps(vTemp,g_XMFixupY16);
736#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
737#endif // _XM_VMX128_INTRINSICS_
738}
739
740//------------------------------------------------------------------------------
741
742XMFINLINE XMVECTOR XMLoadUShortN2
743(
744    CONST XMUSHORTN2* pSource
745)
746{
747#if defined(_XM_NO_INTRINSICS_)
748
749    XMVECTOR V;
750
751    XMASSERT(pSource);
752
753    V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
754    V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;
755
756    return V;
757
758#elif defined(_XM_SSE_INTRINSICS_)
759    static const XMVECTORF32 FixupY16 = {1.0f/65535.0f,1.0f/(65535.0f*65536.0f),0.0f,0.0f};
760    static const XMVECTORF32 FixaddY16 = {0,32768.0f*65536.0f,0,0};
761    XMASSERT(pSource);
762    // Splat the two shorts in all four entries (WORD alignment okay,
763    // DWORD alignment preferred)
764    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
765    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
766    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
767    // y needs to be sign flipped
768    vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
769    // Convert to floating point numbers
770    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
771    // y + 0x8000 to undo the signed order.
772    vTemp = _mm_add_ps(vTemp,FixaddY16);
773    // Y is 65536 times too large
774    vTemp = _mm_mul_ps(vTemp,FixupY16);
775    return vTemp;
776#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
777#endif // _XM_VMX128_INTRINSICS_
778}
779
780//------------------------------------------------------------------------------
781
782XMFINLINE XMVECTOR XMLoadUShort2
783(
784    CONST XMUSHORT2* pSource
785)
786{
787#if defined(_XM_NO_INTRINSICS_)
788
789    XMVECTOR V;
790
791    XMASSERT(pSource);
792
793    V.vector4_f32[0] = (FLOAT)pSource->x;
794    V.vector4_f32[1] = (FLOAT)pSource->y;
795
796    return V;
797
798#elif defined(_XM_SSE_INTRINSICS_)
799    static const XMVECTORF32 FixaddY16 = {0,32768.0f,0,0};
800    XMASSERT(pSource);
801    // Splat the two shorts in all four entries (WORD alignment okay,
802    // DWORD alignment preferred)
803    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->x));
804    // Mask x&0xFFFF, y&0xFFFF0000,z&0,w&0
805    vTemp = _mm_and_ps(vTemp,g_XMMaskX16Y16);
806    // y needs to be sign flipped
807    vTemp = _mm_xor_ps(vTemp,g_XMFlipY);
808    // Convert to floating point numbers
809    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
810    // Y is 65536 times too large
811    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16);
812    // y + 0x8000 to undo the signed order.
813    vTemp = _mm_add_ps(vTemp,FixaddY16);
814    return vTemp;
815#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
816#endif // _XM_VMX128_INTRINSICS_
817}
818
819//------------------------------------------------------------------------------
820
821XMFINLINE XMVECTOR XMLoadByteN2
822(
823    CONST XMBYTEN2* pSource
824)
825{
826    XMASSERT(pSource);
827    {
828    XMVECTOR vResult = {
829        (pSource->x == -128) ? -1.f : ((FLOAT)pSource->x * (1.0f/127.0f)),
830        (pSource->y == -128) ? -1.f : ((FLOAT)pSource->y * (1.0f/127.0f)),
831        0.0f,
832        0.0f
833    };
834    return vResult;
835    }
836}
837
838//------------------------------------------------------------------------------
839
840XMFINLINE XMVECTOR XMLoadByte2
841(
842    CONST XMBYTE2* pSource
843)
844{
845    XMASSERT(pSource);
846    {
847    XMVECTOR vResult = {
848        (FLOAT)pSource->x,
849        (FLOAT)pSource->y,
850        0.0f,
851        0.0f
852    };
853    return vResult;
854    }
855}
856
857//------------------------------------------------------------------------------
858
859XMFINLINE XMVECTOR XMLoadUByteN2
860(
861    CONST XMUBYTEN2* pSource
862)
863{
864    XMASSERT(pSource);
865    {
866    XMVECTOR vResult = {
867        (FLOAT)pSource->x * (1.0f/255.0f),
868        (FLOAT)pSource->y * (1.0f/255.0f),
869        0.0f,
870        0.0f
871    };
872    return vResult;
873    }
874}
875
876//------------------------------------------------------------------------------
877
878XMFINLINE XMVECTOR XMLoadUByte2
879(
880    CONST XMUBYTE2* pSource
881)
882{
883    XMASSERT(pSource);
884    {
885    XMVECTOR vResult = {
886        (FLOAT)pSource->x,
887        (FLOAT)pSource->y,
888        0.0f,
889        0.0f
890    };
891    return vResult;
892    }
893}
894
895//------------------------------------------------------------------------------
896
897XMFINLINE XMVECTOR XMLoadInt3
898(
899    CONST UINT* pSource
900)
901{
902#if defined(_XM_NO_INTRINSICS_)
903
904    XMVECTOR V;
905
906    XMASSERT(pSource);
907
908    V.vector4_u32[0] = pSource[0];
909    V.vector4_u32[1] = pSource[1];
910    V.vector4_u32[2] = pSource[2];
911
912    return V;
913
914#elif defined(_XM_SSE_INTRINSICS_)
915    XMASSERT(pSource);
916
917#ifdef _XM_ISVS2005_
918    __m128i V = _mm_set_epi32( 0, *(pSource+2), *(pSource+1), *pSource );
919    return reinterpret_cast<__m128 *>(&V)[0];
920#else
921    __m128 x = _mm_load_ss( (const float*)pSource );
922    __m128 y = _mm_load_ss( (const float*)(pSource+1) );
923    __m128 z = _mm_load_ss( (const float*)(pSource+2) );
924    __m128 xy = _mm_unpacklo_ps( x, y );
925    return _mm_movelh_ps( xy, z );
926#endif // !_XM_ISVS2005_
927#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
928#endif // _XM_VMX128_INTRINSICS_
929}
930
931//------------------------------------------------------------------------------
932
933XMFINLINE XMVECTOR XMLoadSInt3
934(
935    CONST XMINT3* pSource
936)
937{
938#if defined(_XM_NO_INTRINSICS_)
939    XMVECTOR V;
940    XMASSERT(pSource);
941
942#ifdef _XBOX_VER
943    V = XMLoadInt3( (const UINT*)pSource );
944    return XMConvertVectorIntToFloat( V, 0 );
945#else
946    V.vector4_f32[0] = (float)pSource->x;
947    V.vector4_f32[1] = (float)pSource->y;
948    V.vector4_f32[2] = (float)pSource->z;
949    return V;
950#endif
951
952#elif defined(_XM_SSE_INTRINSICS_)
953    XMASSERT(pSource);
954
955#ifdef _XM_ISVS2005_
956    __m128i V = _mm_set_epi32( 0, pSource->z, pSource->y, pSource->x );
957    return _mm_cvtepi32_ps(V);
958#else
959    __m128 x = _mm_load_ss( (const float*)&pSource->x );
960    __m128 y = _mm_load_ss( (const float*)&pSource->y );
961    __m128 z = _mm_load_ss( (const float*)&pSource->z );
962    __m128 xy = _mm_unpacklo_ps( x, y );
963    __m128 V = _mm_movelh_ps( xy, z );
964    return _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&V)[0]);
965#endif // !_XM_ISVS2005_
966#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
967#endif // _XM_VMX128_INTRINSICS_
968}
969
970//------------------------------------------------------------------------------
971
972XMFINLINE XMVECTOR XMLoadUInt3
973(
974    CONST XMUINT3* pSource
975)
976{
977#if defined(_XM_NO_INTRINSICS_)
978    XMVECTOR V;
979    XMASSERT(pSource);
980
981    V.vector4_f32[0] = (float)pSource->x;
982    V.vector4_f32[1] = (float)pSource->y;
983    V.vector4_f32[2] = (float)pSource->z;
984    return V;
985
986#elif defined(_XM_SSE_INTRINSICS_)
987    XMASSERT(pSource);
988
989#ifdef _XM_ISVS2005_
990    __m128i V = _mm_set_epi32( 0, pSource->z, pSource->y, pSource->x );
991    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
992    // Determine which ones need the fix.
993    XMVECTOR vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&V)[0],g_XMNegativeZero);
994    // Force all values positive
995    XMVECTOR vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&V)[0],vMask);
996#else
997    __m128 x = _mm_load_ss( (const float*)&pSource->x );
998    __m128 y = _mm_load_ss( (const float*)&pSource->y );
999    __m128 z = _mm_load_ss( (const float*)&pSource->z );
1000    __m128 xy = _mm_unpacklo_ps( x, y );
1001    __m128 V = _mm_movelh_ps( xy, z );
1002    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
1003    // Determine which ones need the fix.
1004    XMVECTOR vMask = _mm_and_ps(V,g_XMNegativeZero);
1005    // Force all values positive
1006    XMVECTOR vResult = _mm_xor_ps(V,vMask);
1007#endif // !_XM_ISVS2005_
1008    // Convert to floats
1009    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1010    // Convert 0x80000000 -> 0xFFFFFFFF
1011    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
1012    // For only the ones that are too big, add the fixup
1013    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
1014    vResult = _mm_add_ps(vResult,vMask);
1015    return vResult;
1016
1017#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1018#endif // _XM_VMX128_INTRINSICS_
1019}
1020
1021//------------------------------------------------------------------------------
1022
1023XMFINLINE XMVECTOR XMLoadInt3A
1024(
1025    CONST UINT* pSource
1026)
1027{
1028#if defined(_XM_NO_INTRINSICS_)
1029
1030    XMVECTOR V;
1031
1032    XMASSERT(pSource);
1033    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
1034
1035    V.vector4_u32[0] = pSource[0];
1036    V.vector4_u32[1] = pSource[1];
1037    V.vector4_u32[2] = pSource[2];
1038
1039    return V;
1040
1041#elif defined(_XM_SSE_INTRINSICS_)
1042    XMASSERT(pSource);
1043
1044    // Reads an extra integer that is 'undefined'
1045
1046    __m128i V = _mm_load_si128( (const __m128i*)pSource );
1047    return reinterpret_cast<__m128 *>(&V)[0];
1048#else // _XM_VMX128_INTRINSICS_
1049#endif // _XM_VMX128_INTRINSICS_
1050}
1051
1052//------------------------------------------------------------------------------
1053
1054XMFINLINE XMVECTOR XMLoadFloat3
1055(
1056    CONST XMFLOAT3* pSource
1057)
1058{
1059#if defined(_XM_NO_INTRINSICS_)
1060    XMVECTOR V;
1061    XMASSERT(pSource);
1062
1063    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
1064    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
1065    ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
1066    return V;
1067#elif defined(_XM_SSE_INTRINSICS_)
1068    XMASSERT(pSource);
1069
1070#ifdef _XM_ISVS2005_
1071    // This reads 1 floats past the memory that should be ignored.
1072    // Need to continue to do this for VS 2005 due to compiler issue but prefer new method
1073    // to avoid triggering issues with memory debug tools (like AV)
1074    return _mm_loadu_ps( &pSource->x );
1075#else
1076    __m128 x = _mm_load_ss( &pSource->x );
1077    __m128 y = _mm_load_ss( &pSource->y );
1078    __m128 z = _mm_load_ss( &pSource->z );
1079    __m128 xy = _mm_unpacklo_ps( x, y );
1080    return _mm_movelh_ps( xy, z );
1081#endif // !_XM_ISVS2005_
1082#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1083#endif // _XM_VMX128_INTRINSICS_
1084}
1085
1086//------------------------------------------------------------------------------
1087
1088XMFINLINE XMVECTOR XMLoadFloat3A
1089(
1090    CONST XMFLOAT3A* pSource
1091)
1092{
1093#if defined(_XM_NO_INTRINSICS_)
1094
1095    XMVECTOR V;
1096
1097    XMASSERT(pSource);
1098    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
1099
1100    V.vector4_f32[0] = pSource->x;
1101    V.vector4_f32[1] = pSource->y;
1102    V.vector4_f32[2] = pSource->z;
1103
1104    return V;
1105
1106#elif defined(_XM_SSE_INTRINSICS_)
1107    XMASSERT(pSource);
1108    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
1109
1110    // This reads 1 floats past the memory that should be ignored.
1111    return _mm_load_ps( &pSource->x );
1112#else // _XM_VMX128_INTRINSICS_
1113#endif // _XM_VMX128_INTRINSICS_
1114}
1115
1116//------------------------------------------------------------------------------
1117
1118XMFINLINE XMVECTOR XMLoadUHenDN3
1119(
1120    CONST XMUHENDN3* pSource
1121)
1122{
1123#if defined(_XM_NO_INTRINSICS_)
1124
1125    XMVECTOR          V;
1126    UINT              Element;
1127
1128    XMASSERT(pSource);
1129
1130    Element = pSource->v & 0x7FF;
1131    V.vector4_f32[0] = (FLOAT)Element / 2047.0f;
1132    Element = (pSource->v >> 11) & 0x7FF;
1133    V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
1134    Element = (pSource->v >> 22) & 0x3FF;
1135    V.vector4_f32[2] = (FLOAT)Element / 1023.0f;
1136
1137    return V;
1138
1139#elif defined(_XM_SSE_INTRINSICS_)
1140    static const XMVECTORF32 UHenDN3Mul = {1.0f/2047.0f,1.0f/(2047.0f*2048.0f),1.0f/(1023.0f*2048.0f*2048.0f),0};
1141    XMASSERT(pSource);
1142    // Get the 32 bit value and splat it
1143    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1144    // Mask off x, y and z
1145    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
1146    // Convert x and y to unsigned
1147    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
1148    // Convert to float
1149    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1150    // Convert x and y back to signed
1151    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
1152    // Normalize x,y and z to -1.0f-1.0f
1153    vResult = _mm_mul_ps(vResult,UHenDN3Mul);
1154    return vResult;
1155#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1156#endif // _XM_VMX128_INTRINSICS_
1157}
1158
1159//------------------------------------------------------------------------------
1160
1161XMFINLINE XMVECTOR XMLoadUHenD3
1162(
1163    CONST XMUHEND3* pSource
1164)
1165{
1166#if defined(_XM_NO_INTRINSICS_)
1167
1168    XMVECTOR          V;
1169    UINT              Element;
1170
1171    XMASSERT(pSource);
1172
1173    Element = pSource->v & 0x7FF;
1174    V.vector4_f32[0] = (FLOAT)Element;
1175    Element = (pSource->v >> 11) & 0x7FF;
1176    V.vector4_f32[1] = (FLOAT)Element;
1177    Element = (pSource->v >> 22) & 0x3FF;
1178    V.vector4_f32[2] = (FLOAT)Element;
1179
1180    return V;
1181
1182#elif defined(_XM_SSE_INTRINSICS_)
1183    XMASSERT(pSource);
1184    // Get the 32 bit value and splat it
1185    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1186    // Mask off x, y and z
1187    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
1188    // Convert x and y to unsigned
1189    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
1190    // Convert to float
1191    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1192    // Convert x and y back to signed
1193    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
1194    // Normalize x and y to -1024-1023.0f and z to -512-511.0f
1195    vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
1196    return vResult;
1197#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1198#endif // _XM_VMX128_INTRINSICS_
1199}
1200
1201//------------------------------------------------------------------------------
1202
1203XMFINLINE XMVECTOR XMLoadHenDN3
1204(
1205    CONST XMHENDN3* pSource
1206)
1207{
1208#if defined(_XM_NO_INTRINSICS_)
1209
1210    XMVECTOR          V;
1211    UINT              Element;
1212    static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
1213    static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00};
1214
1215    XMASSERT(pSource);
1216    XMASSERT((pSource->v & 0x7FF) != 0x400);
1217    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
1218    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
1219
1220    Element = pSource->v & 0x7FF;
1221    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
1222    Element = (pSource->v >> 11) & 0x7FF;
1223    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]) / 1023.0f;
1224    Element = (pSource->v >> 22) & 0x3FF;
1225    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]) / 511.0f;
1226
1227    return V;
1228
1229#elif defined(_XM_SSE_INTRINSICS_)
1230    static const XMVECTORF32 HenDN3Mul = {1.0f/1023.0f,1.0f/(1023.0f*2048.0f),1.0f/(511.0f*2048.0f*2048.0f),0};
1231    XMASSERT(pSource);
1232    XMASSERT((pSource->v & 0x7FF) != 0x400);
1233    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
1234    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
1235    // Get the 32 bit value and splat it
1236    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1237    // Mask off x, y and z
1238    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
1239    // Convert x and y to unsigned
1240    vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
1241    // Convert to float
1242    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1243    // Convert x and y back to signed
1244    vResult = _mm_add_ps(vResult,g_XMAddHenD3);
1245    // Normalize x,y and z to -1.0f-1.0f
1246    vResult = _mm_mul_ps(vResult,HenDN3Mul);
1247    return vResult;
1248#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1249#endif // _XM_VMX128_INTRINSICS_
1250}
1251
1252//------------------------------------------------------------------------------
1253
1254XMFINLINE XMVECTOR XMLoadHenD3
1255(
1256    CONST XMHEND3* pSource
1257)
1258{
1259#if defined(_XM_NO_INTRINSICS_)
1260
1261    XMVECTOR          V;
1262    UINT              Element;
1263    static CONST UINT SignExtendXY[] = {0x00000000, 0xFFFFF800};
1264    static CONST UINT SignExtendZ[] = {0x00000000, 0xFFFFFC00};
1265
1266    XMASSERT(pSource);
1267    XMASSERT((pSource->v & 0x7FF) != 0x400);
1268    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
1269    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
1270
1271    Element = pSource->v & 0x7FF;
1272    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
1273    Element = (pSource->v >> 11) & 0x7FF;
1274    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendXY[Element >> 10]);
1275    Element = (pSource->v >> 22) & 0x3FF;
1276    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendZ[Element >> 9]);
1277
1278    return V;
1279
1280#elif defined(_XM_SSE_INTRINSICS_)
1281    XMASSERT(pSource);
1282    XMASSERT((pSource->v & 0x7FF) != 0x400);
1283    XMASSERT(((pSource->v >> 11) & 0x7FF) != 0x400);
1284    XMASSERT(((pSource->v >> 22) & 0x3FF) != 0x200);
1285    // Get the 32 bit value and splat it
1286    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1287    // Mask off x, y and z
1288    vResult = _mm_and_ps(vResult,g_XMMaskHenD3);
1289    // Convert x and y to unsigned
1290    vResult = _mm_xor_ps(vResult,g_XMXorHenD3);
1291    // Convert to float
1292    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1293    // Convert x and y back to signed
1294    vResult = _mm_add_ps(vResult,g_XMAddHenD3);
1295    // Normalize x and y to -1024-1023.0f and z to -512-511.0f
1296    vResult = _mm_mul_ps(vResult,g_XMMulHenD3);
1297    return vResult;
1298#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1299#endif // _XM_VMX128_INTRINSICS_
1300}
1301
1302//------------------------------------------------------------------------------
1303
1304XMFINLINE XMVECTOR XMLoadUDHenN3
1305(
1306    CONST XMUDHENN3* pSource
1307)
1308{
1309#if defined(_XM_NO_INTRINSICS_)
1310
1311    XMVECTOR          V;
1312    UINT              Element;
1313
1314    XMASSERT(pSource);
1315
1316    Element = pSource->v & 0x3FF;
1317    V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
1318    Element = (pSource->v >> 10) & 0x7FF;
1319    V.vector4_f32[1] = (FLOAT)Element / 2047.0f;
1320    Element = (pSource->v >> 21) & 0x7FF;
1321    V.vector4_f32[2] = (FLOAT)Element / 2047.0f;
1322
1323    return V;
1324
1325#elif defined(_XM_SSE_INTRINSICS_)
1326    static const XMVECTORF32 UDHenN3Mul = {1.0f/1023.0f,1.0f/(2047.0f*1024.0f),1.0f/(2047.0f*1024.0f*2048.0f),0};
1327    XMASSERT(pSource);
1328    // Get the 32 bit value and splat it
1329    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1330    // Mask off x, y and z
1331    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
1332    // Convert x and y to unsigned
1333    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
1334    // Convert to float
1335    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1336    // Convert x and y back to signed
1337    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
1338    // Normalize x,y and z to -1.0f-1.0f
1339    vResult = _mm_mul_ps(vResult,UDHenN3Mul);
1340    return vResult;
1341#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1342#endif // _XM_VMX128_INTRINSICS_
1343}
1344
1345//------------------------------------------------------------------------------
1346
1347XMFINLINE XMVECTOR XMLoadUDHen3
1348(
1349    CONST XMUDHEN3* pSource
1350)
1351{
1352#if defined(_XM_NO_INTRINSICS_)
1353
1354    XMVECTOR          V;
1355    UINT              Element;
1356
1357    XMASSERT(pSource);
1358
1359    Element = pSource->v & 0x3FF;
1360    V.vector4_f32[0] = (FLOAT)Element;
1361    Element = (pSource->v >> 10) & 0x7FF;
1362    V.vector4_f32[1] = (FLOAT)Element;
1363    Element = (pSource->v >> 21) & 0x7FF;
1364    V.vector4_f32[2] = (FLOAT)Element;
1365
1366    return V;
1367
1368#elif defined(_XM_SSE_INTRINSICS_)
1369    XMASSERT(pSource);
1370    // Get the 32 bit value and splat it
1371    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1372    // Mask off x, y and z
1373    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
1374    // Convert x and y to unsigned
1375    vResult = _mm_xor_ps(vResult,g_XMFlipZ);
1376    // Convert to float
1377    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1378    // Convert x and y back to signed
1379    vResult = _mm_add_ps(vResult,g_XMAddUHenD3);
1380    // Normalize x to 0-1023.0f and y and z to 0-2047.0f
1381    vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
1382    return vResult;
1383#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1384#endif // _XM_VMX128_INTRINSICS_
1385}
1386
1387//------------------------------------------------------------------------------
1388
1389XMFINLINE XMVECTOR XMLoadDHenN3
1390(
1391    CONST XMDHENN3* pSource
1392)
1393{
1394#if defined(_XM_NO_INTRINSICS_)
1395
1396    XMVECTOR          V;
1397    UINT              Element;
1398    static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00};
1399    static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};
1400
1401    XMASSERT(pSource);
1402    XMASSERT((pSource->v & 0x3FF) != 0x200);
1403    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
1404    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
1405
1406    Element = pSource->v & 0x3FF;
1407    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]) / 511.0f;
1408    Element = (pSource->v >> 10) & 0x7FF;
1409    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;
1410    Element = (pSource->v >> 21) & 0x7FF;
1411    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]) / 1023.0f;
1412
1413    return V;
1414
1415#elif defined(_XM_SSE_INTRINSICS_)
1416    static const XMVECTORF32 DHenN3Mul = {1.0f/511.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*2048.0f),0};
1417    XMASSERT(pSource);
1418    XMASSERT((pSource->v & 0x3FF) != 0x200);
1419    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
1420    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
1421    // Get the 32 bit value and splat it
1422    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1423    // Mask off x, y and z
1424    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
1425    // Convert x and y to unsigned
1426    vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
1427    // Convert to float
1428    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1429    // Convert x and y back to signed
1430    vResult = _mm_add_ps(vResult,g_XMAddDHen3);
1431    // Normalize x,y and z to -1.0f-1.0f
1432    vResult = _mm_mul_ps(vResult,DHenN3Mul);
1433    return vResult;
1434#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1435#endif // _XM_VMX128_INTRINSICS_
1436}
1437
1438//------------------------------------------------------------------------------
1439
1440XMFINLINE XMVECTOR XMLoadDHen3
1441(
1442    CONST XMDHEN3* pSource
1443)
1444{
1445#if defined(_XM_NO_INTRINSICS_)
1446
1447    XMVECTOR          V;
1448    UINT              Element;
1449    static CONST UINT SignExtendX[] = {0x00000000, 0xFFFFFC00};
1450    static CONST UINT SignExtendYZ[] = {0x00000000, 0xFFFFF800};
1451
1452    XMASSERT(pSource);
1453    XMASSERT((pSource->v & 0x3FF) != 0x200);
1454    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
1455    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
1456
1457    Element = pSource->v & 0x3FF;
1458    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtendX[Element >> 9]);
1459    Element = (pSource->v >> 10) & 0x7FF;
1460    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);
1461    Element = (pSource->v >> 21) & 0x7FF;
1462    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtendYZ[Element >> 10]);
1463
1464    return V;
1465
1466#elif defined(_XM_SSE_INTRINSICS_)
1467    XMASSERT(pSource);
1468    XMASSERT((pSource->v & 0x3FF) != 0x200);
1469    XMASSERT(((pSource->v >> 10) & 0x7FF) != 0x400);
1470    XMASSERT(((pSource->v >> 21) & 0x7FF) != 0x400);
1471    // Get the 32 bit value and splat it
1472    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1473    // Mask off x, y and z
1474    vResult = _mm_and_ps(vResult,g_XMMaskDHen3);
1475    // Convert x and y to unsigned
1476    vResult = _mm_xor_ps(vResult,g_XMXorDHen3);
1477    // Convert to float
1478    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1479    // Convert x and y back to signed
1480    vResult = _mm_add_ps(vResult,g_XMAddDHen3);
1481    // Normalize x to -210-511.0f and y and z to -1024-1023.0f
1482    vResult = _mm_mul_ps(vResult,g_XMMulDHen3);
1483    return vResult;
1484#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1485#endif // _XM_VMX128_INTRINSICS_
1486}
1487
1488//------------------------------------------------------------------------------
1489
1490XMFINLINE XMVECTOR XMLoadU565
1491(
1492    CONST XMU565* pSource
1493)
1494{
1495#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
1496    static const XMVECTORI32 U565And = {0x1F,0x3F<<5,0x1F<<11,0};
1497    static const XMVECTORF32 U565Mul = {1.0f,1.0f/32.0f,1.0f/2048.f,0};
1498    XMASSERT(pSource);
1499    // Get the 32 bit value and splat it
1500    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
1501    // Mask off x, y and z
1502    vResult = _mm_and_ps(vResult,U565And);
1503    // Convert to float
1504    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1505    // Normalize x, y, and z
1506    vResult = _mm_mul_ps(vResult,U565Mul);
1507    return vResult;
1508#else
1509    XMVECTOR          V;
1510    UINT              Element;
1511
1512    XMASSERT(pSource);
1513
1514    Element = pSource->v & 0x1F;
1515    V.vector4_f32[0] = (FLOAT)Element;
1516    Element = (pSource->v >> 5) & 0x3F;
1517    V.vector4_f32[1] = (FLOAT)Element;
1518    Element = (pSource->v >> 11) & 0x1F;
1519    V.vector4_f32[2] = (FLOAT)Element;
1520
1521    return V;
1522#endif // !_XM_SSE_INTRINSICS_
1523}
1524
1525//------------------------------------------------------------------------------
1526
1527XMFINLINE XMVECTOR XMLoadFloat3PK
1528(
1529    CONST XMFLOAT3PK* pSource
1530)
1531{
1532    _DECLSPEC_ALIGN_16_ UINT Result[4];
1533    UINT Mantissa;
1534    UINT Exponent;
1535
1536    XMASSERT(pSource);
1537
1538    // X Channel (6-bit mantissa)
1539    Mantissa = pSource->xm;
1540
1541    if ( pSource->xe == 0x1f ) // INF or NAN
1542    {
1543        Result[0] = 0x7f800000 | (pSource->xm << 17);
1544    }
1545    else
1546    {
1547        if ( pSource->xe != 0 ) // The value is normalized
1548        {
1549            Exponent = pSource->xe;
1550        }
1551        else if (Mantissa != 0) // The value is denormalized
1552        {
1553            // Normalize the value in the resulting float
1554            Exponent = 1;
1555
1556            do
1557            {
1558                Exponent--;
1559                Mantissa <<= 1;
1560            } while ((Mantissa & 0x40) == 0);
1561
1562            Mantissa &= 0x3F;
1563        }
1564        else // The value is zero
1565        {
1566            Exponent = (UINT)-112;
1567        }
1568
1569        Result[0] = ((Exponent + 112) << 23) | (Mantissa << 17);
1570    }
1571
1572    // Y Channel (6-bit mantissa)
1573    Mantissa = pSource->ym;
1574
1575    if ( pSource->ye == 0x1f ) // INF or NAN
1576    {
1577        Result[1] = 0x7f800000 | (pSource->ym << 17);
1578    }
1579    else
1580    {
1581        if ( pSource->ye != 0 ) // The value is normalized
1582        {
1583            Exponent = pSource->ye;
1584        }
1585        else if (Mantissa != 0) // The value is denormalized
1586        {
1587            // Normalize the value in the resulting float
1588            Exponent = 1;
1589
1590            do
1591            {
1592                Exponent--;
1593                Mantissa <<= 1;
1594            } while ((Mantissa & 0x40) == 0);
1595
1596            Mantissa &= 0x3F;
1597        }
1598        else // The value is zero
1599        {
1600            Exponent = (UINT)-112;
1601        }
1602
1603        Result[1] = ((Exponent + 112) << 23) | (Mantissa << 17);
1604    }
1605
1606    // Z Channel (5-bit mantissa)
1607    Mantissa = pSource->zm;
1608
1609    if ( pSource->ze == 0x1f ) // INF or NAN
1610    {
1611        Result[2] = 0x7f800000 | (pSource->zm << 17);
1612    }
1613    else
1614    {
1615        if ( pSource->ze != 0 ) // The value is normalized
1616        {
1617            Exponent = pSource->ze;
1618        }
1619        else if (Mantissa != 0) // The value is denormalized
1620        {
1621            // Normalize the value in the resulting float
1622            Exponent = 1;
1623
1624            do
1625            {
1626                Exponent--;
1627                Mantissa <<= 1;
1628            } while ((Mantissa & 0x20) == 0);
1629
1630            Mantissa &= 0x1F;
1631        }
1632        else // The value is zero
1633        {
1634            Exponent = (UINT)-112;
1635        }
1636
1637        Result[2] = ((Exponent + 112) << 23) | (Mantissa << 18);
1638    }
1639
1640    return XMLoadFloat3A( (const XMFLOAT3A*)&Result );
1641}
1642
1643//------------------------------------------------------------------------------
1644
1645XMFINLINE XMVECTOR XMLoadFloat3SE
1646(
1647    CONST XMFLOAT3SE* pSource
1648)
1649{
1650    _DECLSPEC_ALIGN_16_ UINT Result[4];
1651    UINT Mantissa;
1652    UINT Exponent, ExpBits;
1653
1654    XMASSERT(pSource);
1655
1656    if ( pSource->e == 0x1f ) // INF or NAN
1657    {
1658        Result[0] = 0x7f800000 | (pSource->xm << 14);
1659        Result[1] = 0x7f800000 | (pSource->ym << 14);
1660        Result[2] = 0x7f800000 | (pSource->zm << 14);
1661    }
1662    else if ( pSource->e != 0 ) // The values are all normalized
1663    {
1664        Exponent = pSource->e;
1665
1666        ExpBits = (Exponent + 112) << 23;
1667
1668        Mantissa = pSource->xm;
1669        Result[0] = ExpBits | (Mantissa << 14);
1670
1671        Mantissa = pSource->ym;
1672        Result[1] = ExpBits | (Mantissa << 14);
1673
1674        Mantissa = pSource->zm;
1675        Result[2] = ExpBits | (Mantissa << 14);
1676    }
1677    else
1678    {
1679        // X Channel
1680        Mantissa = pSource->xm;
1681
1682        if (Mantissa != 0) // The value is denormalized
1683        {
1684            // Normalize the value in the resulting float
1685            Exponent = 1;
1686
1687            do
1688            {
1689                Exponent--;
1690                Mantissa <<= 1;
1691            } while ((Mantissa & 0x200) == 0);
1692
1693            Mantissa &= 0x1FF;
1694        }
1695        else // The value is zero
1696        {
1697            Exponent = (UINT)-112;
1698        }
1699
1700        Result[0] = ((Exponent + 112) << 23) | (Mantissa << 14);
1701
1702        // Y Channel
1703        Mantissa = pSource->ym;
1704
1705        if (Mantissa != 0) // The value is denormalized
1706        {
1707            // Normalize the value in the resulting float
1708            Exponent = 1;
1709
1710            do
1711            {
1712                Exponent--;
1713                Mantissa <<= 1;
1714            } while ((Mantissa & 0x200) == 0);
1715
1716            Mantissa &= 0x1FF;
1717        }
1718        else // The value is zero
1719        {
1720            Exponent = (UINT)-112;
1721        }
1722
1723        Result[1] = ((Exponent + 112) << 23) | (Mantissa << 14);
1724
1725        // Z Channel
1726        Mantissa = pSource->zm;
1727
1728        if (Mantissa != 0) // The value is denormalized
1729        {
1730            // Normalize the value in the resulting float
1731            Exponent = 1;
1732
1733            do
1734            {
1735                Exponent--;
1736                Mantissa <<= 1;
1737            } while ((Mantissa & 0x200) == 0);
1738
1739            Mantissa &= 0x1FF;
1740        }
1741        else // The value is zero
1742        {
1743            Exponent = (UINT)-112;
1744        }
1745
1746        Result[2] = ((Exponent + 112) << 23) | (Mantissa << 14);
1747    }
1748
1749    return XMLoadFloat3A( (const XMFLOAT3A*)&Result );
1750}
1751
1752//------------------------------------------------------------------------------
1753
1754XMFINLINE XMVECTOR XMLoadInt4
1755(
1756    CONST UINT* pSource
1757)
1758{
1759#if defined(_XM_NO_INTRINSICS_)
1760
1761    XMVECTOR V;
1762
1763    XMASSERT(pSource);
1764
1765    V.vector4_u32[0] = pSource[0];
1766    V.vector4_u32[1] = pSource[1];
1767    V.vector4_u32[2] = pSource[2];
1768    V.vector4_u32[3] = pSource[3];
1769
1770    return V;
1771
1772#elif defined(_XM_SSE_INTRINSICS_)
1773
1774    XMASSERT(pSource);
1775
1776    __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
1777    return reinterpret_cast<__m128 *>(&V)[0];
1778
1779#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1780#endif // _XM_VMX128_INTRINSICS_
1781}
1782
1783//------------------------------------------------------------------------------
1784
1785XMFINLINE XMVECTOR XMLoadSInt4
1786(
1787    CONST XMINT4* pSource
1788)
1789{
1790#if defined(_XM_NO_INTRINSICS_)
1791    XMVECTOR V;
1792    XMASSERT(pSource);
1793
1794#ifdef _XBOX_VER
1795    V = XMLoadInt4( (const UINT*)pSource );
1796    return XMConvertVectorIntToFloat( V, 0 );
1797#else
1798    V.vector4_f32[0] = (float)pSource->x;
1799    V.vector4_f32[1] = (float)pSource->y;
1800    V.vector4_f32[2] = (float)pSource->z;
1801    V.vector4_f32[3] = (float)pSource->w;
1802    return V;
1803#endif
1804
1805#elif defined(_XM_SSE_INTRINSICS_)
1806    XMASSERT(pSource);
1807    __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
1808    return _mm_cvtepi32_ps(V);
1809#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1810#endif // _XM_VMX128_INTRINSICS_
1811}
1812
1813//------------------------------------------------------------------------------
1814
1815XMFINLINE XMVECTOR XMLoadUInt4
1816(
1817    CONST XMUINT4* pSource
1818)
1819{
1820#if defined(_XM_NO_INTRINSICS_)
1821    XMVECTOR V;
1822    XMASSERT(pSource);
1823
1824    V.vector4_f32[0] = (float)pSource->x;
1825    V.vector4_f32[1] = (float)pSource->y;
1826    V.vector4_f32[2] = (float)pSource->z;
1827    V.vector4_f32[3] = (float)pSource->w;
1828    return V;
1829
1830#elif defined(_XM_SSE_INTRINSICS_)
1831    XMASSERT(pSource);
1832    __m128i V = _mm_loadu_si128( (const __m128i*)pSource );
1833    // For the values that are higher than 0x7FFFFFFF, a fixup is needed
1834    // Determine which ones need the fix.
1835    XMVECTOR vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&V)[0],g_XMNegativeZero);
1836    // Force all values positive
1837    XMVECTOR vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&V)[0],vMask);
1838    // Convert to floats
1839    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
1840    // Convert 0x80000000 -> 0xFFFFFFFF
1841    __m128i iMask = _mm_srai_epi32(reinterpret_cast<const __m128i *>(&vMask)[0],31);
1842    // For only the ones that are too big, add the fixup
1843    vMask = _mm_and_ps(reinterpret_cast<const __m128 *>(&iMask)[0],g_XMFixUnsigned);
1844    vResult = _mm_add_ps(vResult,vMask);
1845    return vResult;
1846#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1847#endif // _XM_VMX128_INTRINSICS_
1848}
1849
1850//------------------------------------------------------------------------------
1851
1852XMFINLINE XMVECTOR XMLoadInt4A
1853(
1854    CONST UINT* pSource
1855)
1856{
1857#if defined(_XM_NO_INTRINSICS_)
1858
1859    XMVECTOR V;
1860
1861    XMASSERT(pSource);
1862    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
1863
1864    V.vector4_u32[0] = pSource[0];
1865    V.vector4_u32[1] = pSource[1];
1866    V.vector4_u32[2] = pSource[2];
1867    V.vector4_u32[3] = pSource[3];
1868
1869    return V;
1870
1871#elif defined(_XM_SSE_INTRINSICS_)
1872
1873    XMASSERT(pSource);
1874    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
1875
1876    __m128i V = _mm_load_si128( (const __m128i*)pSource );
1877    return reinterpret_cast<__m128 *>(&V)[0];
1878
1879#else // _XM_VMX128_INTRINSICS_
1880#endif // _XM_VMX128_INTRINSICS_
1881}
1882
1883//------------------------------------------------------------------------------
1884
1885XMFINLINE XMVECTOR XMLoadFloat4
1886(
1887    CONST XMFLOAT4* pSource
1888)
1889{
1890#if defined(_XM_NO_INTRINSICS_)
1891    XMVECTOR V;
1892    XMASSERT(pSource);
1893
1894    ((UINT *)(&V.vector4_f32[0]))[0] = ((const UINT *)(&pSource->x))[0];
1895    ((UINT *)(&V.vector4_f32[1]))[0] = ((const UINT *)(&pSource->y))[0];
1896    ((UINT *)(&V.vector4_f32[2]))[0] = ((const UINT *)(&pSource->z))[0];
1897    ((UINT *)(&V.vector4_f32[3]))[0] = ((const UINT *)(&pSource->w))[0];
1898    return V;
1899#elif defined(_XM_SSE_INTRINSICS_)
1900    XMASSERT(pSource);
1901
1902    return _mm_loadu_ps( &pSource->x );
1903#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1904#endif // _XM_VMX128_INTRINSICS_
1905}
1906
1907//------------------------------------------------------------------------------
1908
1909XMFINLINE XMVECTOR XMLoadFloat4A
1910(
1911    CONST XMFLOAT4A* pSource
1912)
1913{
1914#if defined(_XM_NO_INTRINSICS_)
1915
1916    XMVECTOR V;
1917
1918    XMASSERT(pSource);
1919    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
1920
1921    V.vector4_f32[0] = pSource->x;
1922    V.vector4_f32[1] = pSource->y;
1923    V.vector4_f32[2] = pSource->z;
1924    V.vector4_f32[3] = pSource->w;
1925
1926    return V;
1927
1928#elif defined(_XM_SSE_INTRINSICS_)
1929
1930    XMASSERT(pSource);
1931    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
1932
1933    return _mm_load_ps( &pSource->x );
1934
1935#else // _XM_VMX128_INTRINSICS_
1936#endif // _XM_VMX128_INTRINSICS_
1937}
1938
1939//------------------------------------------------------------------------------
1940
1941XMFINLINE XMVECTOR XMLoadHalf4
1942(
1943    CONST XMHALF4* pSource
1944)
1945{
1946#if defined(_XM_NO_INTRINSICS_)
1947    XMASSERT(pSource);
1948    {
1949    XMVECTOR vResult = {
1950        XMConvertHalfToFloat(pSource->x),
1951        XMConvertHalfToFloat(pSource->y),
1952        XMConvertHalfToFloat(pSource->z),
1953        XMConvertHalfToFloat(pSource->w)
1954    };
1955    return vResult;
1956    }
1957#elif defined(_XM_SSE_INTRINSICS_)
1958    XMASSERT(pSource);
1959    XMVECTOR vResult = {
1960        XMConvertHalfToFloat(pSource->x),
1961        XMConvertHalfToFloat(pSource->y),
1962        XMConvertHalfToFloat(pSource->z),
1963        XMConvertHalfToFloat(pSource->w)
1964    };
1965    return vResult;
1966#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
1967#endif // _XM_VMX128_INTRINSICS_
1968}
1969
1970//------------------------------------------------------------------------------
1971
1972XMFINLINE XMVECTOR XMLoadShortN4
1973(
1974    CONST XMSHORTN4* pSource
1975)
1976{
1977#if defined(_XM_NO_INTRINSICS_)
1978    XMASSERT(pSource);
1979    {
1980    XMVECTOR vResult = {
1981        (pSource->x == -32768) ? -1.f : ((FLOAT)pSource->x * (1.0f/32767.0f)),
1982        (pSource->y == -32768) ? -1.f : ((FLOAT)pSource->y * (1.0f/32767.0f)),
1983        (pSource->z == -32768) ? -1.f : ((FLOAT)pSource->z * (1.0f/32767.0f)),
1984        (pSource->w == -32768) ? -1.f : ((FLOAT)pSource->w * (1.0f/32767.0f))
1985    };
1986    return vResult;
1987    }
1988#elif defined(_XM_SSE_INTRINSICS_)
1989    XMASSERT(pSource);
1990    // Splat the color in all four entries (x,z,y,w)
1991    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
1993    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
1994    // x and z are unsigned! Flip the bits to convert the order to signed
1995    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
1996    // Convert to floating point numbers
1997    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
1998    // x and z - 0x8000 to complete the conversion
1999    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
2000    // Convert to -1.0f - 1.0f
2001    vTemp = _mm_mul_ps(vTemp,g_XMNormalizeX16Y16Z16W16);
2002    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
2003    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
2004    // Clamp result (for case of -32768)
2005    return _mm_max_ps( vTemp, g_XMNegativeOne );
2006#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2007#endif // _XM_VMX128_INTRINSICS_
2008}
2009
2010//------------------------------------------------------------------------------
2011
2012XMFINLINE XMVECTOR XMLoadShort4
2013(
2014    CONST XMSHORT4* pSource
2015)
2016{
2017#if defined(_XM_NO_INTRINSICS_)
2018
2019    XMVECTOR V;
2020
2021    XMASSERT(pSource);
2022
2023    V.vector4_f32[0] = (FLOAT)pSource->x;
2024    V.vector4_f32[1] = (FLOAT)pSource->y;
2025    V.vector4_f32[2] = (FLOAT)pSource->z;
2026    V.vector4_f32[3] = (FLOAT)pSource->w;
2027
2028    return V;
2029
2030#elif defined(_XM_SSE_INTRINSICS_)
2031    XMASSERT(pSource);
2032    // Splat the color in all four entries (x,z,y,w)
2033    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
2035    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
2036    // x and z are unsigned! Flip the bits to convert the order to signed
2037    vTemp = _mm_xor_ps(vTemp,g_XMFlipX16Y16Z16W16);
2038    // Convert to floating point numbers
2039    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2040    // x and z - 0x8000 to complete the conversion
2041    vTemp = _mm_add_ps(vTemp,g_XMFixX16Y16Z16W16);
2042    // Fix y and w because they are 65536 too large
2043    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
2044    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
2045    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
2046#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2047#endif // _XM_VMX128_INTRINSICS_
2048}
2049
2050//------------------------------------------------------------------------------
2051
2052XMFINLINE XMVECTOR XMLoadUShortN4
2053(
2054    CONST XMUSHORTN4* pSource
2055)
2056{
2057#if defined(_XM_NO_INTRINSICS_)
2058
2059    XMVECTOR V;
2060
2061    XMASSERT(pSource);
2062
2063    V.vector4_f32[0] = (FLOAT)pSource->x / 65535.0f;
2064    V.vector4_f32[1] = (FLOAT)pSource->y / 65535.0f;
2065    V.vector4_f32[2] = (FLOAT)pSource->z / 65535.0f;
2066    V.vector4_f32[3] = (FLOAT)pSource->w / 65535.0f;
2067
2068    return V;
2069
2070#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    static const XMVECTORF32 FixupY16W16 = {1.0f/65535.0f,1.0f/65535.0f,1.0f/(65535.0f*65536.0f),1.0f/(65535.0f*65536.0f)};
    static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f*65536.0f,32768.0f*65536.0f};
2075    // Splat the color in all four entries (x,z,y,w)
2076    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
2078    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
2079    // y and w are signed! Flip the bits to convert the order to unsigned
2080    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
2081    // Convert to floating point numbers
2082    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2083    // y and w + 0x8000 to complete the conversion
2084    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
2085    // Fix y and w because they are 65536 too large
2086    vTemp = _mm_mul_ps(vTemp,FixupY16W16);
2087    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
2088    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
2089#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2090#endif // _XM_VMX128_INTRINSICS_
2091}
2092
2093//------------------------------------------------------------------------------
2094
2095XMFINLINE XMVECTOR XMLoadUShort4
2096(
2097    CONST XMUSHORT4* pSource
2098)
2099{
2100#if defined(_XM_NO_INTRINSICS_)
2101
2102    XMVECTOR V;
2103
2104    XMASSERT(pSource);
2105
2106    V.vector4_f32[0] = (FLOAT)pSource->x;
2107    V.vector4_f32[1] = (FLOAT)pSource->y;
2108    V.vector4_f32[2] = (FLOAT)pSource->z;
2109    V.vector4_f32[3] = (FLOAT)pSource->w;
2110
2111    return V;
2112
2113#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    static const XMVECTORF32 FixaddY16W16 = {0,0,32768.0f,32768.0f};
2117    // Splat the color in all four entries (x,z,y,w)
2118    __m128d vIntd = _mm_load1_pd(reinterpret_cast<const double *>(&pSource->x));
    // Mask x&0xffff,z&0xffff,y&0xffff0000,w&0xffff0000
2120    __m128 vTemp = _mm_and_ps(reinterpret_cast<const __m128 *>(&vIntd)[0],g_XMMaskX16Y16Z16W16);
2121    // y and w are signed! Flip the bits to convert the order to unsigned
2122    vTemp = _mm_xor_ps(vTemp,g_XMFlipZW);
2123    // Convert to floating point numbers
2124    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2125    // Fix y and w because they are 65536 too large
2126    vTemp = _mm_mul_ps(vTemp,g_XMFixupY16W16);
2127    // y and w + 0x8000 to complete the conversion
2128    vTemp = _mm_add_ps(vTemp,FixaddY16W16);
2129    // Very important! The entries are x,z,y,w, flip it to x,y,z,w
2130    return _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(3,1,2,0));
2131#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2132#endif // _XM_VMX128_INTRINSICS_
2133}
2134
2135//------------------------------------------------------------------------------
2136
2137XMFINLINE XMVECTOR XMLoadXIcoN4
2138(
2139    CONST XMXICON4* pSource
2140)
2141{
2142#if defined(_XM_NO_INTRINSICS_)
2143
2144    XMVECTOR          V;
2145    UINT              Element;
2146    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
2147
2148    XMASSERT(pSource);
2149    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
2150    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
2151    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
2152
2153    Element = (UINT)(pSource->v & 0xFFFFF);
2154    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2155    Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
2156    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2157    Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
2158    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2159    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;
2160
2161    return V;
2162
2163#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
    static const XMVECTORF32 LoadXIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(15.0f*4096.0f*65536.0f)};
2169    // Grab the 64 bit structure
2170    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32-bit elements
2172    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2173    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2174    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2175    // Fix the entries to x,y,z,w
2176    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2177    // Mask x,y,z and w
2178    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // Bias the packed fields so the signed integer conversion yields the correct values
2180    vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
2181    // Convert to floating point numbers
2182    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Remove the bias applied before the conversion
2184    vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
2185    // Fix y and w because they are too large
2186    vTemp = _mm_mul_ps(vTemp,LoadXIcoN4Mul);
2187    return vTemp;
2188#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2189#endif // _XM_VMX128_INTRINSICS_
2190}
2191
2192//------------------------------------------------------------------------------
2193
2194XMFINLINE XMVECTOR XMLoadXIco4
2195(
2196    CONST XMXICO4* pSource
2197)
2198{
2199#if defined(_XM_NO_INTRINSICS_)
2200
2201    XMVECTOR          V;
2202    UINT              Element;
2203    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
2204
2205    XMASSERT(pSource);
2206    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
2207    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
2208    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
2209
2210    Element = (UINT)(pSource->v & 0xFFFFF);
2211    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2212    Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
2213    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2214    Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
2215    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2216    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);
2217
2218    return V;
2219
2220#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 20) & 0xFFFFFull) != 0x80000ull);
    XMASSERT(((pSource->v >> 40) & 0xFFFFFull) != 0x80000ull);
2225    // Grab the 64 bit structure
2226    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32-bit elements
2228    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2229    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2230    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2231    // Fix the entries to x,y,z,w
2232    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2233    // Mask x,y,z and w
2234    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // Bias the packed fields so the signed integer conversion yields the correct values
2236    vTemp = _mm_xor_ps(vTemp,g_XMXorXIco4);
2237    // Convert to floating point numbers
2238    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Remove the bias applied before the conversion
2240    vTemp = _mm_add_ps(vTemp,g_XMAddXIco4);
2241    // Fix y and w because they are too large
2242    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
2243    return vTemp;
2244#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2245#endif // _XM_VMX128_INTRINSICS_
2246}
2247
2248//------------------------------------------------------------------------------
2249
2250XMFINLINE XMVECTOR XMLoadUIcoN4
2251(
2252    CONST XMUICON4* pSource
2253)
2254{
2255#if defined(_XM_NO_INTRINSICS_)
2256
2257    XMVECTOR V;
2258
2259    XMASSERT(pSource);
2260
2261    V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF) / 1048575.0f;
2262    V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF) / 1048575.0f;
2263    V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF) / 1048575.0f;
2264    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60) / 15.0f;
2265
2266    return V;
2267
2268#elif defined(_XM_SSE_INTRINSICS_)
2269    static const XMVECTORF32 LoadUIcoN4Mul = {1.0f/1048575.0f,1.0f/(1048575.0f*4096.0f),1.0f/1048575.0f,1.0f/(15.0f*4096.0f*65536.0f)};
2270    XMASSERT(pSource);
2271    // Grab the 64 bit structure
2272    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32-bit elements
2274    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2275    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2276    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2277    // Fix the entries to x,y,z,w
2278    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2279    // Mask x,y,z and w
2280    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // y and w straddle the lane sign bits; flip them so the signed conversion works
2282    vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
2283    // Convert to floating point numbers
2284    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Undo the flip applied to y and w above
2286    vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
2287    // Fix y and w because they are too large
2288    vTemp = _mm_mul_ps(vTemp,LoadUIcoN4Mul);
2289    return vTemp;
2290#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2291#endif // _XM_VMX128_INTRINSICS_
2292}
2293
2294//------------------------------------------------------------------------------
2295
2296XMFINLINE XMVECTOR XMLoadUIco4
2297(
2298    CONST XMUICO4* pSource
2299)
2300{
2301#if defined(_XM_NO_INTRINSICS_)
2302
2303    XMVECTOR V;
2304
2305    XMASSERT(pSource);
2306
2307    V.vector4_f32[0] = (FLOAT)(pSource->v & 0xFFFFF);
2308    V.vector4_f32[1] = (FLOAT)((pSource->v >> 20) & 0xFFFFF);
2309    V.vector4_f32[2] = (FLOAT)((pSource->v >> 40) & 0xFFFFF);
2310    V.vector4_f32[3] = (FLOAT)(pSource->v >> 60);
2311
2312    return V;
2313
2314#elif defined(_XM_SSE_INTRINSICS_)
2315    XMASSERT(pSource);
2316    // Grab the 64 bit structure
2317    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32-bit elements
2319    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2320    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2321    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2322    // Fix the entries to x,y,z,w
2323    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2324    // Mask x,y,z and w
2325    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // y and w straddle the lane sign bits; flip them so the signed conversion works
2327    vTemp = _mm_xor_ps(vTemp,g_XMFlipYW);
2328    // Convert to floating point numbers
2329    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Undo the flip applied to y and w above
2331    vTemp = _mm_add_ps(vTemp,g_XMAddUIco4);
2332    // Fix y and w because they are too large
2333    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
2334    return vTemp;
2335#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2336#endif // _XM_VMX128_INTRINSICS_
2337}
2338
2339//------------------------------------------------------------------------------
2340
2341XMFINLINE XMVECTOR XMLoadIcoN4
2342(
2343    CONST XMICON4* pSource
2344)
2345{
2346#if defined(_XM_NO_INTRINSICS_)
2347
2348    XMVECTOR          V;
2349    UINT              Element;
2350    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
2351    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};
2352
2353    XMASSERT(pSource);
2354
2355    Element = (UINT)(pSource->v & 0xFFFFF);
2356    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2357    Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
2358    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2359    Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
2360    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]) / 524287.0f;
2361    Element = (UINT)(pSource->v >> 60);
2362    V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]) / 7.0f;
2363
2364    return V;
2365
2366#elif defined(_XM_SSE_INTRINSICS_)
2367    static const XMVECTORF32 LoadIcoN4Mul = {1.0f/524287.0f,1.0f/(524287.0f*4096.0f),1.0f/524287.0f,1.0f/(7.0f*4096.0f*65536.0f)};
2368    XMASSERT(pSource);
2369    // Grab the 64 bit structure
2370    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32-bit elements
2372    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2373    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2374    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2375    // Fix the entries to x,y,z,w
2376    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2377    // Mask x,y,z and w
2378    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // Bias the packed fields so the signed integer conversion yields the correct values
2380    vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
2381    // Convert to floating point numbers
2382    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Remove the bias applied before the conversion
2384    vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
2385    // Fix y and w because they are too large
2386    vTemp = _mm_mul_ps(vTemp,LoadIcoN4Mul);
2387    return vTemp;
2388#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2389#endif // _XM_VMX128_INTRINSICS_
2390}
2391
2392//------------------------------------------------------------------------------
2393
2394XMFINLINE XMVECTOR XMLoadIco4
2395(
2396    CONST XMICO4* pSource
2397)
2398{
2399#if defined(_XM_NO_INTRINSICS_)
2400
2401    XMVECTOR          V;
2402    UINT              Element;
2403    static CONST UINT SignExtend[] = {0x00000000, 0xFFF00000};
2404    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFF0};
2405
2406    XMASSERT(pSource);
2407
2408    Element = (UINT)(pSource->v & 0xFFFFF);
2409    V.vector4_f32[0] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2410    Element = (UINT)((pSource->v >> 20) & 0xFFFFF);
2411    V.vector4_f32[1] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2412    Element = (UINT)((pSource->v >> 40) & 0xFFFFF);
2413    V.vector4_f32[2] = (FLOAT)(INT)(Element | SignExtend[Element >> 19]);
2414    Element = (UINT)(pSource->v >> 60);
2415    V.vector4_f32[3] = (FLOAT)(INT)(Element | SignExtendW[Element >> 3]);
2416
2417    return V;
2418
2419#elif defined(_XM_SSE_INTRINSICS_)
2420    XMASSERT(pSource);
2421    // Grab the 64 bit structure
2422    __m128d vResultd = _mm_load_sd(reinterpret_cast<const double *>(&pSource->v));
    // By shifting down 8 bits, y and z are in separate 32-bit elements
2424    __m128i vResulti = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vResultd)[0],8/8);
2425    // vResultd has x and w, vResulti has y and z, merge into one as x,w,y,z
2426    XMVECTOR vTemp = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResultd)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(1,0,1,0));
2427    // Fix the entries to x,y,z,w
2428    vTemp = _mm_shuffle_ps(vTemp,vTemp,_MM_SHUFFLE(1,3,2,0));
2429    // Mask x,y,z and w
2430    vTemp = _mm_and_ps(vTemp,g_XMMaskIco4);
    // Bias the packed fields so the signed integer conversion yields the correct values
2432    vTemp = _mm_xor_ps(vTemp,g_XMXorIco4);
2433    // Convert to floating point numbers
2434    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
    // Remove the bias applied before the conversion
2436    vTemp = _mm_add_ps(vTemp,g_XMAddIco4);
2437    // Fix y and w because they are too large
2438    vTemp = _mm_mul_ps(vTemp,g_XMMulIco4);
2439    return vTemp;
2440#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2441#endif // _XM_VMX128_INTRINSICS_
2442}
2443
2444
2445//------------------------------------------------------------------------------
2446
2447XMFINLINE XMVECTOR XMLoadXDecN4
2448(
2449    CONST XMXDECN4* pSource
2450)
2451{
2452#if defined(_XM_NO_INTRINSICS_)
2453    XMVECTOR V;
2454    UINT Element;
2455    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2456
2457    XMASSERT(pSource);
2458    XMASSERT((pSource->v & 0x3FF) != 0x200);
2459    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2460    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2461
2462    Element = pSource->v & 0x3FF;
2463    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2464    Element = (pSource->v >> 10) & 0x3FF;
2465    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2466    Element = (pSource->v >> 20) & 0x3FF;
2467    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2468    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;
2469
2470    return V;
2471
2472#elif defined(_XM_SSE_INTRINSICS_)
2473    XMASSERT(pSource);
2474    // Splat the color in all four entries
2475    __m128 vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z (10 bits each) and w (2 bits)
2477    vTemp = _mm_and_ps(vTemp,g_XMMaskA2B10G10R10);
2478    // a is unsigned! Flip the bit to convert the order to signed
2479    vTemp = _mm_xor_ps(vTemp,g_XMFlipA2B10G10R10);
2480    // Convert to floating point numbers
2481    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2482    // RGB + 0, A + 0x80000000.f to undo the signed order.
2483    vTemp = _mm_add_ps(vTemp,g_XMFixAA2B10G10R10);
    // Convert x, y and z to -1.0f..1.0f and w to 0.0f..1.0f
2485    return _mm_mul_ps(vTemp,g_XMNormalizeA2B10G10R10);
2486#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2487#endif // _XM_VMX128_INTRINSICS_
2488}
2489
2490//------------------------------------------------------------------------------
2491
2492XMFINLINE XMVECTOR XMLoadXDec4
2493(
2494    CONST XMXDEC4* pSource
2495)
2496{
2497#if defined(_XM_NO_INTRINSICS_)
2498
2499    XMVECTOR          V;
2500    UINT              Element;
2501    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2502
2503    XMASSERT(pSource);
2504    XMASSERT((pSource->v & 0x3FF) != 0x200);
2505    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2506    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2507
2508    Element = pSource->v & 0x3FF;
2509    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2510    Element = (pSource->v >> 10) & 0x3FF;
2511    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2512    Element = (pSource->v >> 20) & 0x3FF;
2513    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2514    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);
2515
2516    return V;
2517
2518#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
    static const XMVECTORI32 XDec4Xor = {0x200, 0x200<<10, 0x200<<20, 0x80000000};
    static const XMVECTORF32 XDec4Add = {-512.0f,-512.0f*1024.0f,-512.0f*1024.0f*1024.0f,32768*65536.0f};
2525    // Splat the color in all four entries
2526    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z (10 bits each) and w (2 bits)
2528    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2529    // a is unsigned! Flip the bit to convert the order to signed
2530    vTemp = _mm_xor_ps(vTemp,XDec4Xor);
2531    // Convert to floating point numbers
2532    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2533    // RGB + 0, A + 0x80000000.f to undo the signed order.
2534    vTemp = _mm_add_ps(vTemp,XDec4Add);
    // Fix y, z and w because they are too large
2536    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
2537    return vTemp;
2538#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2539#endif // _XM_VMX128_INTRINSICS_
2540}
2541
2542//------------------------------------------------------------------------------
2543
2544XMFINLINE XMVECTOR XMLoadUDecN4
2545(
2546    CONST XMUDECN4* pSource
2547)
2548{
2549#if defined(_XM_NO_INTRINSICS_)
2550
2551    XMVECTOR          V;
2552    UINT              Element;
2553
2554    XMASSERT(pSource);
2555
2556    Element = pSource->v & 0x3FF;
2557    V.vector4_f32[0] = (FLOAT)Element / 1023.0f;
2558    Element = (pSource->v >> 10) & 0x3FF;
2559    V.vector4_f32[1] = (FLOAT)Element / 1023.0f;
2560    Element = (pSource->v >> 20) & 0x3FF;
2561    V.vector4_f32[2] = (FLOAT)Element / 1023.0f;
2562    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30) / 3.0f;
2563
2564    return V;
2565
2566#elif defined(_XM_SSE_INTRINSICS_)
2567    XMASSERT(pSource);
2568    static const XMVECTORF32 UDecN4Mul = {1.0f/1023.0f,1.0f/(1023.0f*1024.0f),1.0f/(1023.0f*1024.0f*1024.0f),1.0f/(3.0f*1024.0f*1024.0f*1024.0f)};
2569    // Splat the color in all four entries
2570    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z (10 bits each) and w (2 bits)
2572    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2573    // a is unsigned! Flip the bit to convert the order to signed
2574    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2575    // Convert to floating point numbers
2576    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2577    // RGB + 0, A + 0x80000000.f to undo the signed order.
2578    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Convert each field to 0.0f..1.0f
2580    vTemp = _mm_mul_ps(vTemp,UDecN4Mul);
2581    return vTemp;
2582#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2583#endif // _XM_VMX128_INTRINSICS_
2584}
2585
2586//------------------------------------------------------------------------------
2587
2588XMFINLINE XMVECTOR XMLoadUDec4
2589(
2590    CONST XMUDEC4* pSource
2591)
2592{
2593#if defined(_XM_NO_INTRINSICS_)
2594
2595    XMVECTOR          V;
2596    UINT              Element;
2597
2598    XMASSERT(pSource);
2599
2600    Element = pSource->v & 0x3FF;
2601    V.vector4_f32[0] = (FLOAT)Element;
2602    Element = (pSource->v >> 10) & 0x3FF;
2603    V.vector4_f32[1] = (FLOAT)Element;
2604    Element = (pSource->v >> 20) & 0x3FF;
2605    V.vector4_f32[2] = (FLOAT)Element;
2606    V.vector4_f32[3] = (FLOAT)(pSource->v >> 30);
2607
2608    return V;
2609
2610#elif defined(_XM_SSE_INTRINSICS_)
2611    XMASSERT(pSource);
2612    // Splat the color in all four entries
2613    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z (10 bits each) and w (2 bits)
2615    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2616    // a is unsigned! Flip the bit to convert the order to signed
2617    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2618    // Convert to floating point numbers
2619    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2620    // RGB + 0, A + 0x80000000.f to undo the signed order.
2621    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
    // Fix y, z and w because they are too large
2623    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
2624    return vTemp;
2625#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2626#endif // _XM_VMX128_INTRINSICS_
2627}
2628
2629//------------------------------------------------------------------------------
2630
2631XMFINLINE XMVECTOR XMLoadDecN4
2632(
2633    CONST XMDECN4* pSource
2634)
2635{
2636#if defined(_XM_NO_INTRINSICS_)
2637
2638    XMVECTOR          V;
2639    UINT              Element;
2640    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2641    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
2642
2643    XMASSERT(pSource);
2644    XMASSERT((pSource->v & 0x3FF) != 0x200);
2645    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2646    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2647    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2648
2649    Element = pSource->v & 0x3FF;
2650    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2651    Element = (pSource->v >> 10) & 0x3FF;
2652    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2653    Element = (pSource->v >> 20) & 0x3FF;
2654    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]) / 511.0f;
2655    Element = pSource->v >> 30;
2656    V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
2657
2658    return V;
2659
2660#elif defined(_XM_SSE_INTRINSICS_)
2661    XMASSERT(pSource);
2662    XMASSERT((pSource->v & 0x3FF) != 0x200);
2663    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2664    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2665    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2666    static const XMVECTORF32 DecN4Mul = {1.0f/511.0f,1.0f/(511.0f*1024.0f),1.0f/(511.0f*1024.0f*1024.0f),1.0f/(1024.0f*1024.0f*1024.0f)};
2667    // Splat the color in all four entries
2668    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z (10 bits each) and w (2 bits)
2670    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2671    // a is unsigned! Flip the bit to convert the order to signed
2672    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
2673    // Convert to floating point numbers
2674    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2675    // RGB + 0, A + 0x80000000.f to undo the signed order.
2676    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
    // Convert x, y and z to -1.0f..1.0f (w keeps its integer value)
2678    vTemp = _mm_mul_ps(vTemp,DecN4Mul);
2679    return vTemp;
2680#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2681#endif // _XM_VMX128_INTRINSICS_
2682}
2683
2684//------------------------------------------------------------------------------
2685
2686XMFINLINE XMVECTOR XMLoadDec4
2687(
2688    CONST XMDEC4* pSource
2689)
2690{
2691#if defined(_XM_NO_INTRINSICS_)
2692
2693    XMVECTOR          V;
2694    UINT              Element;
2695    static CONST UINT SignExtend[] = {0x00000000, 0xFFFFFC00};
2696    static CONST UINT SignExtendW[] = {0x00000000, 0xFFFFFFFC};
2697
2698    XMASSERT(pSource);
2699    XMASSERT((pSource->v & 0x3FF) != 0x200);
2700    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
2701    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
2702    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2703
2704    Element = pSource->v & 0x3FF;
2705    V.vector4_f32[0] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2706    Element = (pSource->v >> 10) & 0x3FF;
2707    V.vector4_f32[1] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2708    Element = (pSource->v >> 20) & 0x3FF;
2709    V.vector4_f32[2] = (FLOAT)(SHORT)(Element | SignExtend[Element >> 9]);
2710    Element = pSource->v >> 30;
2711    V.vector4_f32[3] = (FLOAT)(SHORT)(Element | SignExtendW[Element >> 1]);
2712
2713    return V;
2714
2715#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pSource);
    XMASSERT((pSource->v & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 10) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 20) & 0x3FF) != 0x200);
    XMASSERT(((pSource->v >> 30) & 0x3) != 0x2);
2721    // Splat the color in all four entries
2722    XMVECTOR vTemp = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y and z (10 bits each) and w (2 bits)
2724    vTemp = _mm_and_ps(vTemp,g_XMMaskDec4);
2725    // a is unsigned! Flip the bit to convert the order to signed
2726    vTemp = _mm_xor_ps(vTemp,g_XMXorDec4);
2727    // Convert to floating point numbers
2728    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2729    // RGB + 0, A + 0x80000000.f to undo the signed order.
2730    vTemp = _mm_add_ps(vTemp,g_XMAddDec4);
    // Fix y, z and w because they are too large
2732    vTemp = _mm_mul_ps(vTemp,g_XMMulDec4);
2733    return vTemp;
2734#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2735#endif // _XM_VMX128_INTRINSICS_
2736}
2737
2738//------------------------------------------------------------------------------
2739
2740XMFINLINE XMVECTOR XMLoadUByteN4
2741(
2742    CONST XMUBYTEN4* pSource
2743)
2744{
2745#if defined(_XM_NO_INTRINSICS_)
2746
2747    XMVECTOR V;
2748
2749    XMASSERT(pSource);
2750
2751    V.vector4_f32[0] = (FLOAT)pSource->x / 255.0f;
2752    V.vector4_f32[1] = (FLOAT)pSource->y / 255.0f;
2753    V.vector4_f32[2] = (FLOAT)pSource->z / 255.0f;
2754    V.vector4_f32[3] = (FLOAT)pSource->w / 255.0f;
2755
2756    return V;
2757
2758#elif defined(_XM_SSE_INTRINSICS_)
2759    static const XMVECTORF32 LoadUByteN4Mul = {1.0f/255.0f,1.0f/(255.0f*256.0f),1.0f/(255.0f*65536.0f),1.0f/(255.0f*65536.0f*256.0f)};
2760    XMASSERT(pSource);
2761    // Splat the color in all four entries (x,z,y,w)
2762    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
2764    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2765    // w is signed! Flip the bits to convert the order to unsigned
2766    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2767    // Convert to floating point numbers
2768    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2769    // w + 0x80 to complete the conversion
2770    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
2771    // Fix y, z and w because they are too large
2772    vTemp = _mm_mul_ps(vTemp,LoadUByteN4Mul);
2773    return vTemp;
2774#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2775#endif // _XM_VMX128_INTRINSICS_
2776}
2777
2778//------------------------------------------------------------------------------
2779
2780XMFINLINE XMVECTOR XMLoadUByte4
2781(
2782    CONST XMUBYTE4* pSource
2783)
2784{
2785#if defined(_XM_NO_INTRINSICS_)
2786
2787    XMVECTOR V;
2788
2789    XMASSERT(pSource);
2790
2791    V.vector4_f32[0] = (FLOAT)pSource->x;
2792    V.vector4_f32[1] = (FLOAT)pSource->y;
2793    V.vector4_f32[2] = (FLOAT)pSource->z;
2794    V.vector4_f32[3] = (FLOAT)pSource->w;
2795
2796    return V;
2797
2798#elif defined(_XM_SSE_INTRINSICS_)
2799    static const XMVECTORF32 LoadUByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
2800    XMASSERT(pSource);
2801    // Splat the color in all four entries (x,z,y,w)
2802    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
2804    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2805    // w is signed! Flip the bits to convert the order to unsigned
2806    vTemp = _mm_xor_ps(vTemp,g_XMFlipW);
2807    // Convert to floating point numbers
2808    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2809    // w + 0x80 to complete the conversion
2810    vTemp = _mm_add_ps(vTemp,g_XMAddUDec4);
2811    // Fix y, z and w because they are too large
2812    vTemp = _mm_mul_ps(vTemp,LoadUByte4Mul);
2813    return vTemp;
2814#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2815#endif // _XM_VMX128_INTRINSICS_
2816}
2817
2818//------------------------------------------------------------------------------
2819
2820XMFINLINE XMVECTOR XMLoadByteN4
2821(
2822    CONST XMBYTEN4* pSource
2823)
2824{
2825#if defined(_XM_NO_INTRINSICS_)
2826
2827    XMVECTOR V;
2828
2829    XMASSERT(pSource);
2830
2831    V.vector4_f32[0] = (pSource->x == -128) ? -1.f : ((FLOAT)pSource->x / 127.0f);
2832    V.vector4_f32[1] = (pSource->y == -128) ? -1.f : ((FLOAT)pSource->y / 127.0f);
2833    V.vector4_f32[2] = (pSource->z == -128) ? -1.f : ((FLOAT)pSource->z / 127.0f);
2834    V.vector4_f32[3] = (pSource->w == -128) ? -1.f : ((FLOAT)pSource->w / 127.0f);
2835
2836    return V;
2837
2838#elif defined(_XM_SSE_INTRINSICS_)
2839    static const XMVECTORF32 LoadByteN4Mul = {1.0f/127.0f,1.0f/(127.0f*256.0f),1.0f/(127.0f*65536.0f),1.0f/(127.0f*65536.0f*256.0f)};
2840    XMASSERT(pSource);
2841    // Splat the color in all four entries (x,z,y,w)
2842    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
2844    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2845    // x,y and z are unsigned! Flip the bits to convert the order to signed
2846    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
2847    // Convert to floating point numbers
2848    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2849    // x, y and z - 0x80 to complete the conversion
2850    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
2851    // Fix y, z and w because they are too large
2852    vTemp = _mm_mul_ps(vTemp,LoadByteN4Mul);
2853    // Clamp result (for case of -128)
2854    return _mm_max_ps( vTemp, g_XMNegativeOne );
2855#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2856#endif // _XM_VMX128_INTRINSICS_
2857}
2858
2859//------------------------------------------------------------------------------
2860
2861XMFINLINE XMVECTOR XMLoadByte4
2862(
2863    CONST XMBYTE4* pSource
2864)
2865{
2866#if defined(_XM_NO_INTRINSICS_)
2867
2868    XMVECTOR V;
2869
2870    XMASSERT(pSource);
2871
2872    V.vector4_f32[0] = (FLOAT)pSource->x;
2873    V.vector4_f32[1] = (FLOAT)pSource->y;
2874    V.vector4_f32[2] = (FLOAT)pSource->z;
2875    V.vector4_f32[3] = (FLOAT)pSource->w;
2876
2877    return V;
2878
2879#elif defined(_XM_SSE_INTRINSICS_)
2880    static const XMVECTORF32 LoadByte4Mul = {1.0f,1.0f/256.0f,1.0f/65536.0f,1.0f/(65536.0f*256.0f)};
2881    XMASSERT(pSource);
2882    // Splat the color in all four entries (x,z,y,w)
2883    XMVECTOR vTemp = _mm_load1_ps(reinterpret_cast<const float *>(&pSource->x));
    // Mask x&0xff,y&0xff00,z&0xff0000,w&0xff000000
2885    vTemp = _mm_and_ps(vTemp,g_XMMaskByte4);
2886    // x,y and z are unsigned! Flip the bits to convert the order to signed
2887    vTemp = _mm_xor_ps(vTemp,g_XMXorByte4);
2888    // Convert to floating point numbers
2889    vTemp = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vTemp)[0]);
2890    // x, y and z - 0x80 to complete the conversion
2891    vTemp = _mm_add_ps(vTemp,g_XMAddByte4);
2892    // Fix y, z and w because they are too large
2893    vTemp = _mm_mul_ps(vTemp,LoadByte4Mul);
2894    return vTemp;
2895#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
2896#endif // _XM_VMX128_INTRINSICS_
2897}
2898
2899//------------------------------------------------------------------------------
2900
2901XMFINLINE XMVECTOR XMLoadUNibble4
2902(
2903     CONST XMUNIBBLE4* pSource
2904)
2905{
2906#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
2907    static const XMVECTORI32 UNibble4And = {0xF,0xF0,0xF00,0xF000};
2908    static const XMVECTORF32 UNibble4Mul = {1.0f,1.0f/16.f,1.0f/256.f,1.0f/4096.f};
2909    XMASSERT(pSource);
2910    // Get the 32 bit value and splat it
2911    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y, z and w
2913    vResult = _mm_and_ps(vResult,UNibble4And);
2914    // Convert to float
2915    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Undo the field shifts so each component holds its integer value
2917    vResult = _mm_mul_ps(vResult,UNibble4Mul);
2918    return vResult;
2919#else
2920    XMVECTOR          V;
2921    UINT              Element;
2922
2923    XMASSERT(pSource);
2924
2925    Element = pSource->v & 0xF;
2926    V.vector4_f32[0] = (FLOAT)Element;
2927    Element = (pSource->v >> 4) & 0xF;
2928    V.vector4_f32[1] = (FLOAT)Element;
2929    Element = (pSource->v >> 8) & 0xF;
2930    V.vector4_f32[2] = (FLOAT)Element;
2931    Element = (pSource->v >> 12) & 0xF;
2932    V.vector4_f32[3] = (FLOAT)Element;
2933
2934    return V;
#endif // !_XM_SSE_INTRINSICS_
2936}
2937
2938//------------------------------------------------------------------------------
2939
2940XMFINLINE XMVECTOR XMLoadU555
2941(
2942     CONST XMU555* pSource
2943)
2944{
2945#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
2946    static const XMVECTORI32 U555And = {0x1F,0x1F<<5,0x1F<<10,0x8000};
2947    static const XMVECTORF32 U555Mul = {1.0f,1.0f/32.f,1.0f/1024.f,1.0f/32768.f};
2948    XMASSERT(pSource);
2949    // Get the 32 bit value and splat it
2950    XMVECTOR vResult = _mm_load_ps1(reinterpret_cast<const float *>(&pSource->v));
    // Mask off x, y, z and w
2952    vResult = _mm_and_ps(vResult,U555And);
2953    // Convert to float
2954    vResult = _mm_cvtepi32_ps(reinterpret_cast<const __m128i *>(&vResult)[0]);
    // Undo the field shifts so each component holds its integer value
2956    vResult = _mm_mul_ps(vResult,U555Mul);
2957    return vResult;
2958#else
2959    XMVECTOR          V;
2960    UINT              Element;
2961
2962    XMASSERT(pSource);
2963
2964    Element = pSource->v & 0x1F;
2965    V.vector4_f32[0] = (FLOAT)Element;
2966    Element = (pSource->v >> 5) & 0x1F;
2967    V.vector4_f32[1] = (FLOAT)Element;
2968    Element = (pSource->v >> 10) & 0x1F;
2969    V.vector4_f32[2] = (FLOAT)Element;
2970    Element = (pSource->v >> 15) & 0x1;
2971    V.vector4_f32[3] = (FLOAT)Element;
2972
2973    return V;
#endif // !_XM_SSE_INTRINSICS_
2975}
2976
2977//------------------------------------------------------------------------------
2978
2979XMFINLINE XMVECTOR XMLoadColor
2980(
2981    CONST XMCOLOR* pSource
2982)
2983{
2984#if defined(_XM_NO_INTRINSICS_)
2985    XMASSERT(pSource);
2986    {
2987    // INT -> Float conversions are done in one instruction.
    // UINT -> Float calls a runtime function, so keep the value as an INT
2989    INT iColor = (INT)(pSource->c);
2990    XMVECTOR vColor = {
2991        (FLOAT)((iColor >> 16) & 0xFF) * (1.0f/255.0f),
2992        (FLOAT)((iColor >> 8) & 0xFF) * (1.0f/255.0f),
2993        (FLOAT)(iColor & 0xFF) * (1.0f/255.0f),
2994        (FLOAT)((iColor >> 24) & 0xFF) * (1.0f/255.0f)
2995    };
2996    return vColor;
2997    }
2998#elif defined(_XM_SSE_INTRINSICS_)
2999    XMASSERT(pSource);
3000    // Splat the color in all four entries
3001    __m128i vInt = _mm_set1_epi32(pSource->c);
3002    // Shift R&0xFF0000, G&0xFF00, B&0xFF, A&0xFF000000
3003    vInt = _mm_and_si128(vInt,g_XMMaskA8R8G8B8);
3004    // a is unsigned! Flip the bit to convert the order to signed
3005    vInt = _mm_xor_si128(vInt,g_XMFlipA8R8G8B8);
3006    // Convert to floating point numbers
3007    XMVECTOR vTemp = _mm_cvtepi32_ps(vInt);
3008    // RGB + 0, A + 0x80000000.f to undo the signed order.
3009    vTemp = _mm_add_ps(vTemp,g_XMFixAA8R8G8B8);
3010    // Convert 0-255 to 0.0f-1.0f
3011    return _mm_mul_ps(vTemp,g_XMNormalizeA8R8G8B8);
3012#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
3013#endif // _XM_VMX128_INTRINSICS_
3014}
3015
3016//------------------------------------------------------------------------------
3017
3018XMFINLINE XMMATRIX XMLoadFloat3x3
3019(
3020    CONST XMFLOAT3X3* pSource
3021)
3022{
3023#if defined(_XM_NO_INTRINSICS_)
3024
3025    XMMATRIX M;
3026
3027    XMASSERT(pSource);
3028
3029    M.r[0].vector4_f32[0] = pSource->m[0][0];
3030    M.r[0].vector4_f32[1] = pSource->m[0][1];
3031    M.r[0].vector4_f32[2] = pSource->m[0][2];
3032    M.r[0].vector4_f32[3] = 0.0f;
3033
3034    M.r[1].vector4_f32[0] = pSource->m[1][0];
3035    M.r[1].vector4_f32[1] = pSource->m[1][1];
3036    M.r[1].vector4_f32[2] = pSource->m[1][2];
3037    M.r[1].vector4_f32[3] = 0.0f;
3038
3039    M.r[2].vector4_f32[0] = pSource->m[2][0];
3040    M.r[2].vector4_f32[1] = pSource->m[2][1];
3041    M.r[2].vector4_f32[2] = pSource->m[2][2];
3042    M.r[2].vector4_f32[3] = 0.0f;
3043
3044    M.r[3].vector4_f32[0] = 0.0f;
3045    M.r[3].vector4_f32[1] = 0.0f;
3046    M.r[3].vector4_f32[2] = 0.0f;
3047    M.r[3].vector4_f32[3] = 1.0f;
3048
3049    return M;
3050
3051#elif defined(_XM_SSE_INTRINSICS_)
3052    XMMATRIX M;
3053    XMVECTOR V1, V2, V3, Z, T1, T2, T3, T4, T5;
3054
3055    Z = _mm_setzero_ps();
3056
3057    XMASSERT(pSource);
3058
3059    V1 = _mm_loadu_ps( &pSource->m[0][0] );
3060    V2 = _mm_loadu_ps( &pSource->m[1][1] );
3061    V3 = _mm_load_ss( &pSource->m[2][2] );
3062
3063    T1 = _mm_unpackhi_ps( V1, Z );
3064    T2 = _mm_unpacklo_ps( V2, Z );
3065    T3 = _mm_shuffle_ps( V3, T2, _MM_SHUFFLE( 0, 1, 0, 0 ) );
3066    T4 = _mm_movehl_ps( T2, T3 );
3067    T5 = _mm_movehl_ps( Z, T1 );
3068
3069    M.r[0] = _mm_movelh_ps( V1, T1 );
3070    M.r[1] = _mm_add_ps( T4, T5 );
3071    M.r[2] = _mm_shuffle_ps( V2, V3, _MM_SHUFFLE(1, 0, 3, 2) );
3072    M.r[3] = g_XMIdentityR3;
3073
3074    return M;
3075#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
3076#endif // _XM_VMX128_INTRINSICS_
3077}
3078
3079//------------------------------------------------------------------------------
3080
3081XMFINLINE XMMATRIX XMLoadFloat4x3
3082(
3083    CONST XMFLOAT4X3* pSource
3084)
3085{
3086#if defined(_XM_NO_INTRINSICS_)
3087    XMMATRIX M;
3088    XMASSERT(pSource);
3089
3090    ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
3091    ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
3092    ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
3093    M.r[0].vector4_f32[3] = 0.0f;
3094
3095    ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
3096    ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
3097    ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
3098    M.r[1].vector4_f32[3] = 0.0f;
3099
3100    ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
3101    ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
3102    ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
3103    M.r[2].vector4_f32[3] = 0.0f;
3104
3105    ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
3106    ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
3107    ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
3108    M.r[3].vector4_f32[3] = 1.0f;
3109
3110    return M;
3111
3112#elif defined(_XM_SSE_INTRINSICS_)
3113    XMASSERT(pSource);
    // Use unaligned load instructions to load the 12 floats
3116    // vTemp1 = x1,y1,z1,x2
3117    XMVECTOR vTemp1 = _mm_loadu_ps(&pSource->m[0][0]);
3118    // vTemp2 = y2,z2,x3,y3
3119    XMVECTOR vTemp2 = _mm_loadu_ps(&pSource->m[1][1]);
3120    // vTemp4 = z3,x4,y4,z4
3121    XMVECTOR vTemp4 = _mm_loadu_ps(&pSource->m[2][2]);
3122    // vTemp3 = x3,y3,z3,z3
3123    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
3124    // vTemp2 = y2,z2,x2,x2
3125    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
3126    // vTemp2 = x2,y2,z2,z2
3127    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
3128    // vTemp1 = x1,y1,z1,0
3129    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
3130    // vTemp2 = x2,y2,z2,0
3131    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
3132    // vTemp3 = x3,y3,z3,0
3133    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
3134    // vTemp4i = x4,y4,z4,0
3135    __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
3136    // vTemp4i = x4,y4,z4,1.0f
3137    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
3138    XMMATRIX M(vTemp1,
3139            vTemp2,
3140            vTemp3,
3141            reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
3142    return M;
3143#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
3144#endif // _XM_VMX128_INTRINSICS_
3145}
3146
3147//------------------------------------------------------------------------------
3148
3149XMFINLINE XMMATRIX XMLoadFloat4x3A
3150(
3151    CONST XMFLOAT4X3A* pSource
3152)
3153{
3154#if defined(_XM_NO_INTRINSICS_)
3155
3156    XMMATRIX M;
3157
3158    XMASSERT(pSource);
3159    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
3160
3161    M.r[0].vector4_f32[0] = pSource->m[0][0];
3162    M.r[0].vector4_f32[1] = pSource->m[0][1];
3163    M.r[0].vector4_f32[2] = pSource->m[0][2];
3164    M.r[0].vector4_f32[3] = 0.0f;
3165
3166    M.r[1].vector4_f32[0] = pSource->m[1][0];
3167    M.r[1].vector4_f32[1] = pSource->m[1][1];
3168    M.r[1].vector4_f32[2] = pSource->m[1][2];
3169    M.r[1].vector4_f32[3] = 0.0f;
3170
3171    M.r[2].vector4_f32[0] = pSource->m[2][0];
3172    M.r[2].vector4_f32[1] = pSource->m[2][1];
3173    M.r[2].vector4_f32[2] = pSource->m[2][2];
3174    M.r[2].vector4_f32[3] = 0.0f;
3175
3176    M.r[3].vector4_f32[0] = pSource->m[3][0];
3177    M.r[3].vector4_f32[1] = pSource->m[3][1];
3178    M.r[3].vector4_f32[2] = pSource->m[3][2];
3179    M.r[3].vector4_f32[3] = 1.0f;
3180
3181    return M;
3182
3183#elif defined(_XM_SSE_INTRINSICS_)
3184    XMASSERT(pSource);
3185    // Use aligned load instructions to
3186    // load the 12 floats
3187    // vTemp1 = x1,y1,z1,x2
3188    XMVECTOR vTemp1 = _mm_load_ps(&pSource->m[0][0]);
3189    // vTemp2 = y2,z2,x3,y3
3190    XMVECTOR vTemp2 = _mm_load_ps(&pSource->m[1][1]);
3191    // vTemp4 = z3,x4,y4,z4
3192    XMVECTOR vTemp4 = _mm_load_ps(&pSource->m[2][2]);
3193    // vTemp3 = x3,y3,z3,z3
3194    XMVECTOR vTemp3 = _mm_shuffle_ps(vTemp2,vTemp4,_MM_SHUFFLE(0,0,3,2));
3195    // vTemp2 = y2,z2,x2,x2
3196    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(3,3,1,0));
3197    // vTemp2 = x2,y2,z2,z2
3198    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp2,_MM_SHUFFLE(1,1,0,2));
3199    // vTemp1 = x1,y1,z1,0
3200    vTemp1 = _mm_and_ps(vTemp1,g_XMMask3);
3201    // vTemp2 = x2,y2,z2,0
3202    vTemp2 = _mm_and_ps(vTemp2,g_XMMask3);
3203    // vTemp3 = x3,y3,z3,0
3204    vTemp3 = _mm_and_ps(vTemp3,g_XMMask3);
3205    // vTemp4i = x4,y4,z4,0
3206    __m128i vTemp4i = _mm_srli_si128(reinterpret_cast<const __m128i *>(&vTemp4)[0],32/8);
3207    // vTemp4i = x4,y4,z4,1.0f
3208    vTemp4i = _mm_or_si128(vTemp4i,g_XMIdentityR3);
3209    XMMATRIX M(vTemp1,
3210            vTemp2,
3211            vTemp3,
3212            reinterpret_cast<const __m128 *>(&vTemp4i)[0]);
3213    return M;
3214#else // _XM_VMX128_INTRINSICS_
3215#endif // _XM_VMX128_INTRINSICS_
3216}
3217
3218//------------------------------------------------------------------------------
3219
3220XMFINLINE XMMATRIX XMLoadFloat4x4
3221(
3222    CONST XMFLOAT4X4* pSource
3223)
3224{
3225#if defined(_XM_NO_INTRINSICS_)
3226    XMMATRIX M;
3227    XMASSERT(pSource);
3228
3229    ((UINT *)(&M.r[0].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[0][0]))[0];
3230    ((UINT *)(&M.r[0].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[0][1]))[0];
3231    ((UINT *)(&M.r[0].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[0][2]))[0];
3232    ((UINT *)(&M.r[0].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[0][3]))[0];
3233
3234    ((UINT *)(&M.r[1].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[1][0]))[0];
3235    ((UINT *)(&M.r[1].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[1][1]))[0];
3236    ((UINT *)(&M.r[1].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[1][2]))[0];
3237    ((UINT *)(&M.r[1].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[1][3]))[0];
3238
3239    ((UINT *)(&M.r[2].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[2][0]))[0];
3240    ((UINT *)(&M.r[2].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[2][1]))[0];
3241    ((UINT *)(&M.r[2].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[2][2]))[0];
3242    ((UINT *)(&M.r[2].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[2][3]))[0];
3243
3244    ((UINT *)(&M.r[3].vector4_f32[0]))[0] = ((const UINT *)(&pSource->m[3][0]))[0];
3245    ((UINT *)(&M.r[3].vector4_f32[1]))[0] = ((const UINT *)(&pSource->m[3][1]))[0];
3246    ((UINT *)(&M.r[3].vector4_f32[2]))[0] = ((const UINT *)(&pSource->m[3][2]))[0];
3247    ((UINT *)(&M.r[3].vector4_f32[3]))[0] = ((const UINT *)(&pSource->m[3][3]))[0];
3248
3249    return M;
3250
3251#elif defined(_XM_SSE_INTRINSICS_)
3252    XMASSERT(pSource);
3253    XMMATRIX M;
3254
3255    M.r[0] = _mm_loadu_ps( &pSource->_11 );
3256    M.r[1] = _mm_loadu_ps( &pSource->_21 );
3257    M.r[2] = _mm_loadu_ps( &pSource->_31 );
3258    M.r[3] = _mm_loadu_ps( &pSource->_41 );
3259
3260    return M;
3261#elif defined(XM_NO_MISALIGNED_VECTOR_ACCESS)
3262#endif // _XM_VMX128_INTRINSICS_
3263}
3264
3265//------------------------------------------------------------------------------
3266
3267XMFINLINE XMMATRIX XMLoadFloat4x4A
3268(
3269    CONST XMFLOAT4X4A* pSource
3270)
3271{
3272#if defined(_XM_NO_INTRINSICS_)
3273
3274    XMMATRIX M;
3275
3276    XMASSERT(pSource);
3277    XMASSERT(((UINT_PTR)pSource & 0xF) == 0);
3278
3279    M.r[0].vector4_f32[0] = pSource->m[0][0];
3280    M.r[0].vector4_f32[1] = pSource->m[0][1];
3281    M.r[0].vector4_f32[2] = pSource->m[0][2];
3282    M.r[0].vector4_f32[3] = pSource->m[0][3];
3283
3284    M.r[1].vector4_f32[0] = pSource->m[1][0];
3285    M.r[1].vector4_f32[1] = pSource->m[1][1];
3286    M.r[1].vector4_f32[2] = pSource->m[1][2];
3287    M.r[1].vector4_f32[3] = pSource->m[1][3];
3288
3289    M.r[2].vector4_f32[0] = pSource->m[2][0];
3290    M.r[2].vector4_f32[1] = pSource->m[2][1];
3291    M.r[2].vector4_f32[2] = pSource->m[2][2];
3292    M.r[2].vector4_f32[3] = pSource->m[2][3];
3293
3294    M.r[3].vector4_f32[0] = pSource->m[3][0];
3295    M.r[3].vector4_f32[1] = pSource->m[3][1];
3296    M.r[3].vector4_f32[2] = pSource->m[3][2];
3297    M.r[3].vector4_f32[3] = pSource->m[3][3];
3298
3299    return M;
3300
3301#elif defined(_XM_SSE_INTRINSICS_)
3302    XMMATRIX M;
3303
3304    XMASSERT(pSource);
3305
3306    M.r[0] = _mm_load_ps( &pSource->_11 );
3307    M.r[1] = _mm_load_ps( &pSource->_21 );
3308    M.r[2] = _mm_load_ps( &pSource->_31 );
3309    M.r[3] = _mm_load_ps( &pSource->_41 );
3310
3311    return M;
3312#else // _XM_VMX128_INTRINSICS_
3313#endif // _XM_VMX128_INTRINSICS_
3314}
3315
3316/****************************************************************************
3317 *
3318 * Vector and matrix store operations
3319 *
3320 ****************************************************************************/
3321
3322XMFINLINE VOID XMStoreInt
3323(
3324    UINT*    pDestination,
3325    FXMVECTOR V
3326)
3327{
3328#if defined(_XM_NO_INTRINSICS_)
3329
3330    XMASSERT(pDestination);
3331    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3332
3333    *pDestination = XMVectorGetIntX( V );
3334
3335#elif defined(_XM_SSE_INTRINSICS_)
3336    XMASSERT(pDestination);
3337    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3338
3339    _mm_store_ss( (float*)pDestination, V );
3340#else // _XM_VMX128_INTRINSICS_
3341#endif // _XM_VMX128_INTRINSICS_
3342}
3343
3344//------------------------------------------------------------------------------
3345
3346XMFINLINE VOID XMStoreFloat
3347(
3348    FLOAT*    pDestination,
3349    FXMVECTOR V
3350)
3351{
3352#if defined(_XM_NO_INTRINSICS_)
3353
3354    XMASSERT(pDestination);
3355    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3356
3357    *pDestination = XMVectorGetX( V );
3358
3359#elif defined(_XM_SSE_INTRINSICS_)
3360    XMASSERT(pDestination);
3361    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3362
3363    _mm_store_ss( pDestination, V );
3364#else // _XM_VMX128_INTRINSICS_
3365#endif // _XM_VMX128_INTRINSICS_
3366}
3367
3368//------------------------------------------------------------------------------
3369
3370XMFINLINE VOID XMStoreInt2
3371(
3372    UINT*    pDestination,
3373    FXMVECTOR V
3374)
3375{
3376#if defined(_XM_NO_INTRINSICS_)
3377
3378    XMASSERT(pDestination);
3379    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3380
3381    pDestination[0] = V.vector4_u32[0];
3382    pDestination[1] = V.vector4_u32[1];
3383
3384#elif defined(_XM_SSE_INTRINSICS_)
3385    XMASSERT(pDestination);
3386    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3387
3388    XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3389    _mm_store_ss( (float*)&pDestination[0], V );
3390    _mm_store_ss( (float*)&pDestination[1], T );
3391#else // _XM_VMX128_INTRINSICS_
3392#endif // _XM_VMX128_INTRINSICS_
3393}
3394
3395//------------------------------------------------------------------------------
3396
3397XMFINLINE VOID XMStoreSInt2
3398(
3399    XMINT2* pDestination,
3400    FXMVECTOR V
3401)
3402{
3403#if defined(_XM_NO_INTRINSICS_)
3404
3405    XMASSERT(pDestination);
3406    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3407
3408    pDestination->x = (INT)V.vector4_f32[0];
3409    pDestination->y = (INT)V.vector4_f32[1];
3410
3411#elif defined(_XM_SSE_INTRINSICS_)
3412    XMASSERT(pDestination);
3413    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3414
3415    // In case of positive overflow, detect it
3416    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
3417    // Float to int conversion
3418    __m128i vResulti = _mm_cvttps_epi32(V);
3419    // If there was positive overflow, set to 0x7FFFFFFF
3420    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
3421    vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
3422    vOverflow = _mm_or_ps(vOverflow,vResult);
3423    // Write two ints
3424    XMVECTOR T = _mm_shuffle_ps( vOverflow, vOverflow, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3425    _mm_store_ss( (float*)&pDestination->x, vOverflow );
3426    _mm_store_ss( (float*)&pDestination->y, T );
3427#else // _XM_VMX128_INTRINSICS_
3428#endif // _XM_VMX128_INTRINSICS_
3429}
3430
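// Note on the saturation above (the same pattern is used by XMStoreSInt3 and
// XMStoreSInt4): _mm_cvttps_epi32 returns the integer indefinite value
// 0x80000000 for inputs outside the INT32 range, so inputs greater than
// g_XMMaxInt (just below 2^31) are detected first and forced to 0x7FFFFFFF.
// Sketch of the blend performed above:
//
//     result = (overflowMask & 0x7FFFFFFF) | (~overflowMask & truncatedInt);
//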
3431//------------------------------------------------------------------------------
3432
3433XMFINLINE VOID XMStoreUInt2
3434(
3435    XMUINT2* pDestination,
3436    FXMVECTOR V
3437)
3438{
3439#if defined(_XM_NO_INTRINSICS_)
3440
3441    XMASSERT(pDestination);
3442    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3443
3444    pDestination->x = (UINT)V.vector4_f32[0];
3445    pDestination->y = (UINT)V.vector4_f32[1];
3446
3447#elif defined(_XM_SSE_INTRINSICS_)
3448    XMASSERT(pDestination);
3449    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3450
3451    // Clamp to >=0
3452    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3453    // Any numbers that are too big, set to 0xFFFFFFFFU
3454    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
3455    XMVECTOR vValue = g_XMUnsignedFix;
3456    // Too large for a signed integer?
3457    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
3458    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
3459    vValue = _mm_and_ps(vValue,vMask);
3460    // Perform fixup only on numbers too large (Keeps low bit precision)
3461    vResult = _mm_sub_ps(vResult,vValue);
3462    __m128i vResulti = _mm_cvttps_epi32(vResult);
3463    // Convert from signed to unsigned only if greater than 0x80000000
3464    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
3465    vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
3466    // On those that are too large, set to 0xFFFFFFFF
3467    vResult = _mm_or_ps(vResult,vOverflow);
3468    // Write two uints
3469    XMVECTOR T = _mm_shuffle_ps( vResult, vResult, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3470    _mm_store_ss( (float*)&pDestination->x, vResult );
3471    _mm_store_ss( (float*)&pDestination->y, T );
3472#else // _XM_VMX128_INTRINSICS_
3473#endif // _XM_VMX128_INTRINSICS_
3474}
3475
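// Note on the unsigned conversion above (also used by XMStoreUInt3 and
// XMStoreUInt4): SSE2 has no float-to-uint32 instruction, so values of 2^31 or
// more are first reduced by g_XMUnsignedFix (2147483648.0f), converted as
// signed integers, and the sign bit is then XORed back in.  Illustrative
// walkthrough for an input of 3000000000.0f:
//
//     3000000000.0f - 2147483648.0f = 852516352.0f    // fixup applied
//     (INT)852516352.0f             = 0x32D05E00      // signed truncation
//     0x32D05E00 ^ 0x80000000       = 0xB2D05E00      // == 3000000000
//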
3476//------------------------------------------------------------------------------
3477
3478XMFINLINE VOID XMStoreInt2A
3479(
3480    UINT*    pDestination,
3481    FXMVECTOR V
3482)
3483{
3484#if defined(_XM_NO_INTRINSICS_)
3485
3486    XMASSERT(pDestination);
3487    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3488
3489    pDestination[0] = V.vector4_u32[0];
3490    pDestination[1] = V.vector4_u32[1];
3491
3492#elif defined(_XM_SSE_INTRINSICS_)
3493
3494    XMASSERT(pDestination);
3495    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3496
3497    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
3498
3499#else // _XM_VMX128_INTRINSICS_
3500#endif // _XM_VMX128_INTRINSICS_
3501}
3502
3503//------------------------------------------------------------------------------
3504
3505XMFINLINE VOID XMStoreFloat2
3506(
3507    XMFLOAT2* pDestination,
3508    FXMVECTOR  V
3509)
3510{
3511#if defined(_XM_NO_INTRINSICS_)
3512
3513    XMASSERT(pDestination);
3514    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3515
3516    pDestination->x = V.vector4_f32[0];
3517    pDestination->y = V.vector4_f32[1];
3518
3519#elif defined(_XM_SSE_INTRINSICS_)
3520    XMASSERT(pDestination);
3521    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3522
3523    XMVECTOR T = _mm_shuffle_ps( V, V, _MM_SHUFFLE( 1, 1, 1, 1 ) );
3524    _mm_store_ss( &pDestination->x, V );
3525    _mm_store_ss( &pDestination->y, T );
3526#else // _XM_VMX128_INTRINSICS_
3527#endif // _XM_VMX128_INTRINSICS_
3528}
3529
3530//------------------------------------------------------------------------------
3531
3532XMFINLINE VOID XMStoreFloat2A
3533(
3534    XMFLOAT2A*   pDestination,
3535    FXMVECTOR     V
3536)
3537{
3538#if defined(_XM_NO_INTRINSICS_)
3539
3540    XMASSERT(pDestination);
3541    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3542
3543    pDestination->x = V.vector4_f32[0];
3544    pDestination->y = V.vector4_f32[1];
3545
3546#elif defined(_XM_SSE_INTRINSICS_)
3547
3548    XMASSERT(pDestination);
3549    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3550
3551    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
3552
3553#else // _XM_VMX128_INTRINSICS_
3554#endif // _XM_VMX128_INTRINSICS_
3555}
3556
3557//------------------------------------------------------------------------------
3558
3559XMFINLINE VOID XMStoreHalf2
3560(
3561    XMHALF2* pDestination,
3562    FXMVECTOR V
3563)
3564{
3565#if defined(_XM_NO_INTRINSICS_)
3566
3567    XMASSERT(pDestination);
3568
3569    pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
3570    pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
3571
3572#elif defined(_XM_SSE_INTRINSICS_)
3573    XMASSERT(pDestination);
3574    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
3575    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
3576#else // _XM_VMX128_INTRINSICS_
3577#endif // _XM_VMX128_INTRINSICS_
3578}
3579
3580//------------------------------------------------------------------------------
3581
3582XMFINLINE VOID XMStoreShortN2
3583(
3584    XMSHORTN2* pDestination,
3585    FXMVECTOR   V
3586)
3587{
3588#if defined(_XM_NO_INTRINSICS_)
3589
3590    XMVECTOR N;
3591    static CONST XMVECTORF32  Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3592
3593    XMASSERT(pDestination);
3594
3595    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
3596    N = XMVectorMultiply(N, Scale.v);
3597    N = XMVectorRound(N);
3598
3599    pDestination->x = (SHORT)N.vector4_f32[0];
3600    pDestination->y = (SHORT)N.vector4_f32[1];
3601
3602#elif defined(_XM_SSE_INTRINSICS_)
3603    XMASSERT(pDestination);
3604    static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3605
3606    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
3607    vResult = _mm_min_ps(vResult,g_XMOne);
3608    vResult = _mm_mul_ps(vResult,Scale);
3609    __m128i vResulti = _mm_cvtps_epi32(vResult);
3610    vResulti = _mm_packs_epi32(vResulti,vResulti);
3611    _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vResulti)[0]);
3612#else // _XM_VMX128_INTRINSICS_
3613#endif // _XM_VMX128_INTRINSICS_
3614}
3615
3616//------------------------------------------------------------------------------
3617
3618XMFINLINE VOID XMStoreShort2
3619(
3620    XMSHORT2* pDestination,
3621    FXMVECTOR  V
3622)
3623{
3624#if defined(_XM_NO_INTRINSICS_)
3625
3626    XMVECTOR               N;
3627    static CONST XMVECTOR  Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
3628    static CONST XMVECTOR  Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3629
3630    XMASSERT(pDestination);
3631
3632    N = XMVectorClamp(V, Min, Max);
3633    N = XMVectorRound(N);
3634
3635    pDestination->x = (SHORT)N.vector4_f32[0];
3636    pDestination->y = (SHORT)N.vector4_f32[1];
3637
3638#elif defined(_XM_SSE_INTRINSICS_)
3639    XMASSERT(pDestination);
3640    static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
3641    static CONST XMVECTORF32 Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
3642    // Bounds check
3643    XMVECTOR vResult = _mm_max_ps(V,Min);
3644    vResult = _mm_min_ps(vResult,Max);
3645     // Convert to int with rounding
3646    __m128i vInt = _mm_cvtps_epi32(vResult);
3647    // Pack the ints into shorts
3648    vInt = _mm_packs_epi32(vInt,vInt);
3649    _mm_store_ss(reinterpret_cast<float *>(&pDestination->x),reinterpret_cast<const __m128 *>(&vInt)[0]);
3650#else // _XM_VMX128_INTRINSICS_
3651#endif // _XM_VMX128_INTRINSICS_
3652}
3653
3654//------------------------------------------------------------------------------
3655
3656XMFINLINE VOID XMStoreUShortN2
3657(
3658    XMUSHORTN2* pDestination,
3659    FXMVECTOR    V
3660)
3661{
3662#if defined(_XM_NO_INTRINSICS_)
3663
3664    XMVECTOR               N;
3665    static CONST XMVECTORF32  Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3666
3667    XMASSERT(pDestination);
3668
3669    N = XMVectorSaturate(V);
3670    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
3671    N = XMVectorTruncate(N);
3672
3673    pDestination->x = (USHORT)N.vector4_f32[0];
3674    pDestination->y = (USHORT)N.vector4_f32[1];
3675
3676#elif defined(_XM_SSE_INTRINSICS_)
3677    XMASSERT(pDestination);
3678    static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3679    // Bounds check
3680    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3681    vResult = _mm_min_ps(vResult,g_XMOne);
3682    vResult = _mm_mul_ps(vResult,Scale);
3683     // Convert to int with rounding
3684    __m128i vInt = _mm_cvtps_epi32(vResult);
3685    // Since the SSE pack instruction clamps using signed rules,
3686    // manually extract the values to store them to memory
3687    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
3688    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
3689#else // _XM_VMX128_INTRINSICS_
3690#endif // _XM_VMX128_INTRINSICS_
3691}
3692
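// Note on the manual extraction above (XMStoreUShort2 below does the same):
// _mm_packs_epi32 saturates to the signed range [-32768, 32767], and the
// unsigned pack _mm_packus_epi32 is only available from SSE4.1 onward, so the
// two 16-bit results are pulled out with _mm_extract_epi16 instead of being
// packed as a vector.
//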
3693//------------------------------------------------------------------------------
3694
3695XMFINLINE VOID XMStoreUShort2
3696(
3697    XMUSHORT2* pDestination,
3698    FXMVECTOR   V
3699)
3700{
3701#if defined(_XM_NO_INTRINSICS_)
3702
3703    XMVECTOR               N;
3704    static CONST XMVECTOR  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3705
3706    XMASSERT(pDestination);
3707
3708    N = XMVectorClamp(V, XMVectorZero(), Max);
3709    N = XMVectorRound(N);
3710
3711    pDestination->x = (USHORT)N.vector4_f32[0];
3712    pDestination->y = (USHORT)N.vector4_f32[1];
3713
3714#elif defined(_XM_SSE_INTRINSICS_)
3715    XMASSERT(pDestination);
3716    static CONST XMVECTORF32  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
3717    // Bounds check
3718    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3719    vResult = _mm_min_ps(vResult,Max);
3720     // Convert to int with rounding
3721    __m128i vInt = _mm_cvtps_epi32(vResult);
3722    // Since the SSE pack instruction clamps using signed rules,
3723    // manually extract the values to store them to memory
3724    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
3725    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
3726#else // _XM_VMX128_INTRINSICS_
3727#endif // _XM_VMX128_INTRINSICS_
3728}
3729
3730//------------------------------------------------------------------------------
3731
3732XMFINLINE VOID XMStoreByteN2
3733(
3734    XMBYTEN2* pDestination,
3735    FXMVECTOR   V
3736)
3737{
3738    XMVECTOR N;
3739    XMFLOAT4A tmp;
3740    static CONST XMVECTORF32  Scale = {127.0f, 127.0f, 127.0f, 127.0f};
3741
3742    XMASSERT(pDestination);
3743
3744    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
3745    N = XMVectorMultiply(N, Scale.v);
3746    N = XMVectorRound(N);
3747
3748    XMStoreFloat4A( &tmp, N );
3749
3750    pDestination->x = (CHAR)tmp.x;
3751    pDestination->y = (CHAR)tmp.y;
3752}
3753
3754//------------------------------------------------------------------------------
3755
3756XMFINLINE VOID XMStoreByte2
3757(
3758    XMBYTE2* pDestination,
3759    FXMVECTOR  V
3760)
3761{
3762    XMVECTOR               N;
3763    XMFLOAT4A              tmp;
3764    static CONST XMVECTOR  Min = {-127.0f, -127.0f, -127.0f, -127.0f};
3765    static CONST XMVECTOR  Max = {127.0f, 127.0f, 127.0f, 127.0f};
3766
3767    XMASSERT(pDestination);
3768
3769    N = XMVectorClamp(V, Min, Max);
3770    N = XMVectorRound(N);
3771
3772    XMStoreFloat4A( &tmp, N );
3773
3774    pDestination->x = (CHAR)tmp.x;
3775    pDestination->y = (CHAR)tmp.y;
3776}
3777
3778//------------------------------------------------------------------------------
3779
3780XMFINLINE VOID XMStoreUByteN2
3781(
3782    XMUBYTEN2* pDestination,
3783    FXMVECTOR    V
3784)
3785{
3786    XMVECTOR               N;
3787    XMFLOAT4A              tmp;
3788    static CONST XMVECTORF32  Scale = {255.0f, 255.0f, 255.0f, 255.0f};
3789
3790    XMASSERT(pDestination);
3791
3792    N = XMVectorSaturate(V);
3793    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
3794    N = XMVectorTruncate(N);
3795
3796    XMStoreFloat4A( &tmp, N );
3797
3798    pDestination->x = (BYTE)tmp.x;
3799    pDestination->y = (BYTE)tmp.y;
3800}
3801
3802//------------------------------------------------------------------------------
3803
3804XMFINLINE VOID XMStoreUByte2
3805(
3806    XMUBYTE2* pDestination,
3807    FXMVECTOR   V
3808)
3809{
3810    XMVECTOR               N;
3811    static CONST XMVECTOR  Max = {255.0f, 255.0f, 255.0f, 255.0f};
3812    XMFLOAT4A              tmp;
3813
3814    XMASSERT(pDestination);
3815
3816    N = XMVectorClamp(V, XMVectorZero(), Max);
3817    N = XMVectorRound(N);
3818
3819    XMStoreFloat4A( &tmp, N );
3820
3821    pDestination->x = (BYTE)tmp.x;
3822    pDestination->y = (BYTE)tmp.y;
3823}
3824
3825//------------------------------------------------------------------------------
3826
3827XMFINLINE VOID XMStoreInt3
3828(
3829    UINT*    pDestination,
3830    FXMVECTOR V
3831)
3832{
3833#if defined(_XM_NO_INTRINSICS_)
3834
3835    XMASSERT(pDestination);
3836    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3837
3838    pDestination[0] = V.vector4_u32[0];
3839    pDestination[1] = V.vector4_u32[1];
3840    pDestination[2] = V.vector4_u32[2];
3841
3842#elif defined(_XM_SSE_INTRINSICS_)
3843
3844    XMASSERT(pDestination);
3845    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3846
3847    XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
3848    XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
3849    _mm_store_ss( (float*)pDestination, V );
3850    _mm_store_ss( (float*)&pDestination[1], T1 );
3851    _mm_store_ss( (float*)&pDestination[2], T2 );
3852
3853#else // _XM_VMX128_INTRINSICS_
3854#endif // _XM_VMX128_INTRINSICS_
3855}
3856
3857//------------------------------------------------------------------------------
3858
3859XMFINLINE VOID XMStoreSInt3
3860(
3861    XMINT3* pDestination,
3862    FXMVECTOR V
3863)
3864{
3865#if defined(_XM_NO_INTRINSICS_)
3866
3867    XMASSERT(pDestination);
3868    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3869
3870    pDestination->x = (INT)V.vector4_f32[0];
3871    pDestination->y = (INT)V.vector4_f32[1];
3872    pDestination->z = (INT)V.vector4_f32[2];
3873
3874#elif defined(_XM_SSE_INTRINSICS_)
3875
3876    XMASSERT(pDestination);
3877    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3878
3879    // In case of positive overflow, detect it
3880    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
3881    // Float to int conversion
3882    __m128i vResulti = _mm_cvttps_epi32(V);
3883    // If there was positive overflow, set to 0x7FFFFFFF
3884    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
3885    vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
3886    vOverflow = _mm_or_ps(vOverflow,vResult);
3887    // Write 3 uints
3888    XMVECTOR T1 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(1,1,1,1));
3889    XMVECTOR T2 = _mm_shuffle_ps(vOverflow,vOverflow,_MM_SHUFFLE(2,2,2,2));
3890    _mm_store_ss( (float*)&pDestination->x, vOverflow );
3891    _mm_store_ss( (float*)&pDestination->y, T1 );
3892    _mm_store_ss( (float*)&pDestination->z, T2 );
3893
3894#else // _XM_VMX128_INTRINSICS_
3895#endif // _XM_VMX128_INTRINSICS_
3896}
3897
3898//------------------------------------------------------------------------------
3899
3900XMFINLINE VOID XMStoreUInt3
3901(
3902    XMUINT3* pDestination,
3903    FXMVECTOR V
3904)
3905{
3906#if defined(_XM_NO_INTRINSICS_)
3907
3908    XMASSERT(pDestination);
3909    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3910
3911    pDestination->x = (UINT)V.vector4_f32[0];
3912    pDestination->y = (UINT)V.vector4_f32[1];
3913    pDestination->z = (UINT)V.vector4_f32[2];
3914
3915#elif defined(_XM_SSE_INTRINSICS_)
3916
3917    XMASSERT(pDestination);
3918    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3919
3920    // Clamp to >=0
3921    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
3922    // Any numbers that are too big, set to 0xFFFFFFFFU
3923    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
3924    XMVECTOR vValue = g_XMUnsignedFix;
3925    // Too large for a signed integer?
3926    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
3927    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
3928    vValue = _mm_and_ps(vValue,vMask);
3929    // Perform fixup only on numbers too large (Keeps low bit precision)
3930    vResult = _mm_sub_ps(vResult,vValue);
3931    __m128i vResulti = _mm_cvttps_epi32(vResult);
3932    // Convert from signed to unsigned only if greater than 0x80000000
3933    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
3934    vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
3935    // On those that are too large, set to 0xFFFFFFFF
3936    vResult = _mm_or_ps(vResult,vOverflow);
3937    // Write 3 uints
3938    XMVECTOR T1 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(1,1,1,1));
3939    XMVECTOR T2 = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(2,2,2,2));
3940    _mm_store_ss( (float*)&pDestination->x, vResult );
3941    _mm_store_ss( (float*)&pDestination->y, T1 );
3942    _mm_store_ss( (float*)&pDestination->z, T2 );
3943
3944#else // _XM_VMX128_INTRINSICS_
3945#endif // _XM_VMX128_INTRINSICS_
3946}
3947
3948//------------------------------------------------------------------------------
3949
3950XMFINLINE VOID XMStoreInt3A
3951(
3952    UINT*    pDestination,
3953    FXMVECTOR V
3954)
3955{
3956#if defined(_XM_NO_INTRINSICS_)
3957
3958    XMASSERT(pDestination);
3959    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3960
3961    pDestination[0] = V.vector4_u32[0];
3962    pDestination[1] = V.vector4_u32[1];
3963    pDestination[2] = V.vector4_u32[2];
3964
3965#elif defined(_XM_SSE_INTRINSICS_)
3966
3967    XMASSERT(pDestination);
3968    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
3969
3970    XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
3971    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
3972    _mm_store_ss( (float*)&pDestination[2], T );
3973
3974#else // _XM_VMX128_INTRINSICS_
3975#endif // _XM_VMX128_INTRINSICS_
3976}
3977
3978//------------------------------------------------------------------------------
3979
3980XMFINLINE VOID XMStoreFloat3
3981(
3982    XMFLOAT3* pDestination,
3983    FXMVECTOR V
3984)
3985{
3986#if defined(_XM_NO_INTRINSICS_)
3987
3988    XMASSERT(pDestination);
3989    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3990
3991    pDestination->x = V.vector4_f32[0];
3992    pDestination->y = V.vector4_f32[1];
3993    pDestination->z = V.vector4_f32[2];
3994
3995#elif defined(_XM_SSE_INTRINSICS_)
3996
3997    XMASSERT(pDestination);
3998    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
3999
4000    XMVECTOR T1 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(1,1,1,1));
4001    XMVECTOR T2 = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
4002    _mm_store_ss( &pDestination->x, V );
4003    _mm_store_ss( &pDestination->y, T1 );
4004    _mm_store_ss( &pDestination->z, T2 );
4005
4006#else // _XM_VMX128_INTRINSICS_
4007#endif // _XM_VMX128_INTRINSICS_
4008}
4009
4010//------------------------------------------------------------------------------
4011
4012XMFINLINE VOID XMStoreFloat3A
4013(
4014    XMFLOAT3A*   pDestination,
4015    FXMVECTOR     V
4016)
4017{
4018#if defined(_XM_NO_INTRINSICS_)
4019
4020    XMASSERT(pDestination);
4021    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4022
4023    pDestination->x = V.vector4_f32[0];
4024    pDestination->y = V.vector4_f32[1];
4025    pDestination->z = V.vector4_f32[2];
4026
4027#elif defined(_XM_SSE_INTRINSICS_)
4028
4029    XMASSERT(pDestination);
4030    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4031
4032    XMVECTOR T = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,2,2,2));
4033    _mm_storel_epi64( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
4034    _mm_store_ss( &pDestination->z, T );
4035
4036#else // _XM_VMX128_INTRINSICS_
4037#endif // _XM_VMX128_INTRINSICS_
4038}
4039
4040//------------------------------------------------------------------------------
4041
4042XMFINLINE VOID XMStoreUHenDN3
4043(
4044    XMUHENDN3* pDestination,
4045    FXMVECTOR   V
4046)
4047{
4048#if defined(_XM_NO_INTRINSICS_)
4049
4050    XMVECTOR               N;
4051    static CONST XMVECTORF32  Scale = {2047.0f, 2047.0f, 1023.0f, 0.0f};
4052
4053    XMASSERT(pDestination);
4054
4055    N = XMVectorSaturate(V);
4056    N = XMVectorMultiply(N, Scale.v);
4057
4058    pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
4059                      (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
4060                      (((UINT)N.vector4_f32[0] & 0x7FF));
4061
4062#elif defined(_XM_SSE_INTRINSICS_)
4063    XMASSERT(pDestination);
4064    static const XMVECTORF32 ScaleUHenDN3 = {2047.0f, 2047.0f*2048.0f,1023.0f*(2048.0f*2048.0f)/2.0f,1.0f};
4065    static const XMVECTORI32 MaskUHenDN3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
4066    // Clamp to bounds
4067    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4068    vResult = _mm_min_ps(vResult,g_XMOne);
4069    // Scale by multiplication
4070    vResult = _mm_mul_ps(vResult,ScaleUHenDN3);
4071    // Convert to int
4072    __m128i vResulti = _mm_cvttps_epi32(vResult);
4073    // Mask off any fraction
4074    vResulti = _mm_and_si128(vResulti,MaskUHenDN3);
4075    // Do a horizontal or of 3 entries
4076    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
4077    // i = x|y
4078    vResulti = _mm_or_si128(vResulti,vResulti2);
4079    // Move Z to the x position
4080    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
4081    // Add Z to itself to perform a single bit left shift
4082    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
4083    // i = x|y|z
4084    vResulti = _mm_or_si128(vResulti,vResulti2);
4085    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4086#else // _XM_VMX128_INTRINSICS_
4087#endif // _XM_VMX128_INTRINSICS_
4088}
4089
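// Note on the scale constants above: rather than shifting the converted
// integers into their 11:11:10 bit positions, the y and z inputs are
// pre-multiplied by 2048 and 2048*2048 so the bits already land in place after
// the float-to-int conversion.  The z scale is halved (and its mask pre-shifted
// down by one bit) to keep the scaled maximum inside the signed range accepted
// by _mm_cvttps_epi32; the final _mm_add_epi32 doubles it back, completing the
// 22-bit shift before the horizontal OR assembles x|y|z.  The other unsigned
// UHenD/UDHen stores below follow the same pattern.
//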
4090//------------------------------------------------------------------------------
4091
4092XMFINLINE VOID XMStoreUHenD3
4093(
4094    XMUHEND3* pDestination,
4095    FXMVECTOR  V
4096)
4097{
4098#if defined(_XM_NO_INTRINSICS_)
4099
4100    XMVECTOR               N;
4101    static CONST XMVECTOR  Max = {2047.0f, 2047.0f, 1023.0f, 0.0f};
4102
4103    XMASSERT(pDestination);
4104
4105    N = XMVectorClamp(V, XMVectorZero(), Max);
4106
4107    pDestination->v = (((UINT)N.vector4_f32[2] & 0x3FF) << 22) |
4108                      (((UINT)N.vector4_f32[1] & 0x7FF) << 11) |
4109                      (((UINT)N.vector4_f32[0] & 0x7FF));
4110
4111#elif defined(_XM_SSE_INTRINSICS_)
4112    XMASSERT(pDestination);
4113    static const XMVECTORF32 MaxUHenD3 = { 2047.0f, 2047.0f, 1023.0f, 1.0f};
4114    static const XMVECTORF32 ScaleUHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f)/2.0f,1.0f};
4115    static const XMVECTORI32 MaskUHenD3 = {0x7FF,0x7FF<<11,0x3FF<<(22-1),0};
4116    // Clamp to bounds
4117    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4118    vResult = _mm_min_ps(vResult,MaxUHenD3);
4119    // Scale by multiplication
4120    vResult = _mm_mul_ps(vResult,ScaleUHenD3);
4121    // Convert to int
4122    __m128i vResulti = _mm_cvttps_epi32(vResult);
4123    // Mask off any fraction
4124    vResulti = _mm_and_si128(vResulti,MaskUHenD3);
4125    // Do a horizontal or of 3 entries
4126    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
4127    // i = x|y
4128    vResulti = _mm_or_si128(vResulti,vResulti2);
4129    // Move Z to the x position
4130    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
4131    // Add Z to itself to perform a single bit left shift
4132    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
4133    // i = x|y|z
4134    vResulti = _mm_or_si128(vResulti,vResulti2);
4135    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4136#else // _XM_VMX128_INTRINSICS_
4137#endif // _XM_VMX128_INTRINSICS_
4138}
4139
4140//------------------------------------------------------------------------------
4141
4142XMFINLINE VOID XMStoreHenDN3
4143(
4144    XMHENDN3* pDestination,
4145    FXMVECTOR V
4146)
4147{
4148#if defined(_XM_NO_INTRINSICS_)
4149
4150    XMVECTOR               N;
4151    static CONST XMVECTORF32  Scale = {1023.0f, 1023.0f, 511.0f, 1.0f};
4152
4153    XMASSERT(pDestination);
4154
4155    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
4156    N = XMVectorMultiply(N, Scale.v);
4157
4158    pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
4159                      (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
4160                      (((INT)N.vector4_f32[0] & 0x7FF));
4161
4162#elif defined(_XM_SSE_INTRINSICS_)
4163    XMASSERT(pDestination);
4164    static const XMVECTORF32 ScaleHenDN3 = {1023.0f, 1023.0f*2048.0f,511.0f*(2048.0f*2048.0f),1.0f};
4165    // Clamp to bounds
4166    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
4167    vResult = _mm_min_ps(vResult,g_XMOne);
4168    // Scale by multiplication
4169    vResult = _mm_mul_ps(vResult,ScaleHenDN3);
4170    // Convert to int
4171    __m128i vResulti = _mm_cvttps_epi32(vResult);
4172    // Mask off any fraction
4173    vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
4174    // Do a horizontal or of all 4 entries
4175    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
4176    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4177    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
4178    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4179    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4180#else // _XM_VMX128_INTRINSICS_
4181#endif // _XM_VMX128_INTRINSICS_
4182}
4183
4184//------------------------------------------------------------------------------
4185
4186XMFINLINE VOID XMStoreHenD3
4187(
4188    XMHEND3* pDestination,
4189    FXMVECTOR V
4190)
4191{
4192#if defined(_XM_NO_INTRINSICS_)
4193
4194    XMVECTOR               N;
4195    static CONST XMVECTOR  Min = {-1023.0f, -1023.0f, -511.0f, -1.0f};
4196    static CONST XMVECTOR  Max = {1023.0f, 1023.0f, 511.0f, 1.0f};
4197
4198    XMASSERT(pDestination);
4199
4200    N = XMVectorClamp(V, Min, Max);
4201
4202    pDestination->v = (((INT)N.vector4_f32[2] & 0x3FF) << 22) |
4203                      (((INT)N.vector4_f32[1] & 0x7FF) << 11) |
4204                      (((INT)N.vector4_f32[0] & 0x7FF));
4205
4206#elif defined(_XM_SSE_INTRINSICS_)
4207    XMASSERT(pDestination);
4208    static const XMVECTORF32 MinHenD3 = {-1023.0f,-1023.0f,-511.0f,-1.0f};
4209    static const XMVECTORF32 MaxHenD3 = { 1023.0f, 1023.0f, 511.0f, 1.0f};
4210    static const XMVECTORF32 ScaleHenD3 = {1.0f, 2048.0f,(2048.0f*2048.0f),1.0f};
4211    // Clamp to bounds
4212    XMVECTOR vResult = _mm_max_ps(V,MinHenD3);
4213    vResult = _mm_min_ps(vResult,MaxHenD3);
4214    // Scale by multiplication
4215    vResult = _mm_mul_ps(vResult,ScaleHenD3);
4216    // Convert to int
4217    __m128i vResulti = _mm_cvttps_epi32(vResult);
4218    // Mask off any fraction
4219    vResulti = _mm_and_si128(vResulti,g_XMMaskHenD3);
4220    // Do a horizontal or of all 4 entries
4221    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
4222    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4223    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
4224    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4225    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4226#else // _XM_VMX128_INTRINSICS_
4227#endif // _XM_VMX128_INTRINSICS_
4228}
4229
4230//------------------------------------------------------------------------------
4231
4232XMFINLINE VOID XMStoreUDHenN3
4233(
4234    XMUDHENN3* pDestination,
4235    FXMVECTOR   V
4236)
4237{
4238#if defined(_XM_NO_INTRINSICS_)
4239
4240    XMVECTOR               N;
4241    static CONST XMVECTORF32  Scale = {1023.0f, 2047.0f, 2047.0f, 0.0f};
4242
4243    XMASSERT(pDestination);
4244
4245    N = XMVectorSaturate(V);
4246    N = XMVectorMultiply(N, Scale.v);
4247
4248    pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
4249                      (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
4250                      (((UINT)N.vector4_f32[0] & 0x3FF));
4251
4252#elif defined(_XM_SSE_INTRINSICS_)
4253    XMASSERT(pDestination);
4254    static const XMVECTORF32 ScaleUDHenN3 = {1023.0f,2047.0f*1024.0f,2047.0f*(1024.0f*2048.0f)/2.0f,1.0f};
4255    static const XMVECTORI32 MaskUDHenN3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
4256    // Clamp to bounds
4257    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4258    vResult = _mm_min_ps(vResult,g_XMOne);
4259    // Scale by multiplication
4260    vResult = _mm_mul_ps(vResult,ScaleUDHenN3);
4261    // Convert to int
4262    __m128i vResulti = _mm_cvttps_epi32(vResult);
4263    // Mask off any fraction
4264    vResulti = _mm_and_si128(vResulti,MaskUDHenN3);
4265    // Do a horizontal or of 3 entries
4266    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
4267    // i = x|y
4268    vResulti = _mm_or_si128(vResulti,vResulti2);
4269    // Move Z to the x position
4270    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
4271    // Add Z to itself to perform a single bit left shift
4272    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
4273    // i = x|y|z
4274    vResulti = _mm_or_si128(vResulti,vResulti2);
4275    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4276#else // _XM_VMX128_INTRINSICS_
4277#endif // _XM_VMX128_INTRINSICS_
4278}
4279
4280//------------------------------------------------------------------------------
4281
4282XMFINLINE VOID XMStoreUDHen3
4283(
4284    XMUDHEN3* pDestination,
4285    FXMVECTOR  V
4286)
4287{
4288#if defined(_XM_NO_INTRINSICS_)
4289
4290    XMVECTOR               N;
4291    static CONST XMVECTOR  Max = {1023.0f, 2047.0f, 2047.0f, 0.0f};
4292
4293    XMASSERT(pDestination);
4294
4295    N = XMVectorClamp(V, XMVectorZero(), Max);
4296
4297    pDestination->v = (((UINT)N.vector4_f32[2] & 0x7FF) << 21) |
4298                      (((UINT)N.vector4_f32[1] & 0x7FF) << 10) |
4299                      (((UINT)N.vector4_f32[0] & 0x3FF));
4300
4301#elif defined(_XM_SSE_INTRINSICS_)
4302    XMASSERT(pDestination);
4303    static const XMVECTORF32 MaxUDHen3 = { 1023.0f, 2047.0f, 2047.0f, 1.0f};
4304    static const XMVECTORF32 ScaleUDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f)/2.0f,1.0f};
4305    static const XMVECTORI32 MaskUDHen3 = {0x3FF,0x7FF<<10,0x7FF<<(21-1),0};
4306    // Clamp to bounds
4307    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4308    vResult = _mm_min_ps(vResult,MaxUDHen3);
4309    // Scale by multiplication
4310    vResult = _mm_mul_ps(vResult,ScaleUDHen3);
4311    // Convert to int
4312    __m128i vResulti = _mm_cvttps_epi32(vResult);
4313    // Mask off any fraction
4314    vResulti = _mm_and_si128(vResulti,MaskUDHen3);
4315    // Do a horizontal or of 3 entries
4316    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(0,3,2,1));
4317    // i = x|y
4318    vResulti = _mm_or_si128(vResulti,vResulti2);
4319    // Move Z to the x position
4320    vResulti2 = _mm_shuffle_epi32(vResulti2,_MM_SHUFFLE(0,3,2,1));
4321    // Add Z to itself to perform a single bit left shift
4322    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
4323    // i = x|y|z
4324    vResulti = _mm_or_si128(vResulti,vResulti2);
4325    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4326#else // _XM_VMX128_INTRINSICS_
4327#endif // _XM_VMX128_INTRINSICS_
4328}
4329
4330//------------------------------------------------------------------------------
4331
4332XMFINLINE VOID XMStoreDHenN3
4333(
4334    XMDHENN3* pDestination,
4335    FXMVECTOR V
4336)
4337{
4338#if defined(_XM_NO_INTRINSICS_)
4339
4340    XMVECTOR               N;
4341    static CONST XMVECTORF32  Scale = {511.0f, 1023.0f, 1023.0f, 1.0f};
4342
4343    XMASSERT(pDestination);
4344
4345    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
4346    N = XMVectorMultiply(N, Scale.v);
4347
4348    pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
4349                      (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
4350                      (((INT)N.vector4_f32[0] & 0x3FF));
4351
4352#elif defined(_XM_SSE_INTRINSICS_)
4353    XMASSERT(pDestination);
4354    static const XMVECTORF32 ScaleDHenN3 = {511.0f, 1023.0f*1024.0f,1023.0f*(1024.0f*2048.0f),1.0f};
4355    // Clamp to bounds
4356    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
4357    vResult = _mm_min_ps(vResult,g_XMOne);
4358    // Scale by multiplication
4359    vResult = _mm_mul_ps(vResult,ScaleDHenN3);
4360    // Convert to int
4361    __m128i vResulti = _mm_cvttps_epi32(vResult);
4362    // Mask off any fraction
4363    vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
4364    // Do a horizontal or of all 4 entries
4365    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
4366    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4367    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
4368    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4369    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4370#else // _XM_VMX128_INTRINSICS_
4371#endif // _XM_VMX128_INTRINSICS_
4372}
4373
4374//------------------------------------------------------------------------------
4375
4376XMFINLINE VOID XMStoreDHen3
4377(
4378    XMDHEN3* pDestination,
4379    FXMVECTOR V
4380)
4381{
4382#if defined(_XM_NO_INTRINSICS_)
4383
4384    XMVECTOR               N;
4385    static CONST XMVECTOR  Min = {-511.0f, -1023.0f, -1023.0f, -1.0f};
4386    static CONST XMVECTOR  Max = {511.0f, 1023.0f, 1023.0f, 1.0f};
4387
4388    XMASSERT(pDestination);
4389
4390    N = XMVectorClamp(V, Min, Max);
4391
4392    pDestination->v = (((INT)N.vector4_f32[2] & 0x7FF) << 21) |
4393                      (((INT)N.vector4_f32[1] & 0x7FF) << 10) |
4394                      (((INT)N.vector4_f32[0] & 0x3FF));
4395
4396#elif defined(_XM_SSE_INTRINSICS_)
4397    XMASSERT(pDestination);
4398    static const XMVECTORF32 MinDHen3 = {-511.0f,-1023.0f,-1023.0f,-1.0f};
4399    static const XMVECTORF32 MaxDHen3 = { 511.0f, 1023.0f, 1023.0f, 1.0f};
4400    static const XMVECTORF32 ScaleDHen3 = {1.0f, 1024.0f,(1024.0f*2048.0f),1.0f};
4401    // Clamp to bounds
4402    XMVECTOR vResult = _mm_max_ps(V,MinDHen3);
4403    vResult = _mm_min_ps(vResult,MaxDHen3);
4404    // Scale by multiplication
4405    vResult = _mm_mul_ps(vResult,ScaleDHen3);
4406    // Convert to int
4407    __m128i vResulti = _mm_cvttps_epi32(vResult);
4408    // Mask off any fraction
4409    vResulti = _mm_and_si128(vResulti,g_XMMaskDHen3);
4410    // Do a horizontal or of all 4 entries
4411    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
4412    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4413    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
4414    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
4415    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
4416#else // _XM_VMX128_INTRINSICS_
4417#endif // _XM_VMX128_INTRINSICS_
4418}
4419
4420//------------------------------------------------------------------------------
4421
4422XMFINLINE VOID XMStoreU565
4423(
4424    XMU565* pDestination,
4425    FXMVECTOR V
4426)
4427{
4428#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
4429    XMASSERT(pDestination);
4430    static CONST XMVECTORF32  Max = {31.0f, 63.0f, 31.0f, 0.0f};
4431    // Bounds check
4432    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4433    vResult = _mm_min_ps(vResult,Max);
4434     // Convert to int with rounding
4435    __m128i vInt = _mm_cvtps_epi32(vResult);
4436    // No SSE operations will write to 16-bit values, so we have to extract them manually
4437    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
4438    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
4439    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
4440    pDestination->v = ((z & 0x1F) << 11) |
4441                      ((y & 0x3F) << 5) |
4442                      ((x & 0x1F));
4443#else
4444    XMVECTOR               N;
4445    static CONST XMVECTORF32  Max = {31.0f, 63.0f, 31.0f, 0.0f};
4446
4447    XMASSERT(pDestination);
4448
4449    N = XMVectorClamp(V, XMVectorZero(), Max.v);
4450    N = XMVectorRound(N);
4451
4452    pDestination->v = (((USHORT)N.vector4_f32[2] & 0x1F) << 11) |
4453                      (((USHORT)N.vector4_f32[1] & 0x3F) << 5) |
4454                      (((USHORT)N.vector4_f32[0] & 0x1F));
4455#endif // !_XM_SSE_INTRINSICS_
4456}
4457
4458//------------------------------------------------------------------------------
4459
4460XMFINLINE VOID XMStoreFloat3PK
4461(
4462    XMFLOAT3PK* pDestination,
4463    FXMVECTOR V
4464)
4465{
4466    _DECLSPEC_ALIGN_16_ UINT IValue[4];
4467    UINT I, Sign, j;
4468    UINT Result[3];
4469
4470    XMASSERT(pDestination);
4471
4472    XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );
4473
4474    // X & Y Channels (5-bit exponent, 6-bit mantissa)
4475    for(j=0; j < 2; ++j)
4476    {
4477        Sign = IValue[j] & 0x80000000;
4478        I = IValue[j] & 0x7FFFFFFF;
4479
4480        if ((I & 0x7F800000) == 0x7F800000)
4481        {
4482            // INF or NAN
4483            Result[j] = 0x7c0;
4484            if (( I & 0x7FFFFF ) != 0)
4485            {
4486                Result[j] = 0x7c0 | (((I>>17)|(I>>11)|(I>>6)|(I))&0x3f);
4487            }
4488            else if ( Sign )
4489            {
4490                // -INF is clamped to 0 since 3PK is positive only
4491                Result[j] = 0;
4492            }
4493        }
4494        else if ( Sign )
4495        {
4496            // 3PK is positive only, so clamp to zero
4497            Result[j] = 0;
4498        }
4499        else if (I > 0x477E0000U)
4500        {
4501            // The number is too large to be represented as a float11, set to max
4502            Result[j] = 0x7BF;
4503        }
4504        else
4505        {
4506            if (I < 0x38800000U)
4507            {
4508                // The number is too small to be represented as a normalized float11
4509                // Convert it to a denormalized value.
4510                UINT Shift = 113U - (I >> 23U);
4511                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
4512            }
4513            else
4514            {
4515                // Rebias the exponent to represent the value as a normalized float11
4516                I += 0xC8000000U;
4517            }
4518
4519            Result[j] = ((I + 0xFFFFU + ((I >> 17U) & 1U)) >> 17U)&0x7ffU;
4520        }
4521    }
4522
4523    // Z Channel (5-bit exponent, 5-bit mantissa)
4524    Sign = IValue[2] & 0x80000000;
4525    I = IValue[2] & 0x7FFFFFFF;
4526
4527    if ((I & 0x7F800000) == 0x7F800000)
4528    {
4529        // INF or NAN
4530        Result[2] = 0x3e0;
4531        if ( I & 0x7FFFFF )
4532        {
4533            Result[2] = 0x3e0 | (((I>>18)|(I>>13)|(I>>3)|(I))&0x1f);
4534        }
4535        else if ( Sign )
4536        {
4537            // -INF is clamped to 0 since 3PK is positive only
4538            Result[2] = 0;
4539        }
4540    }
4541    else if ( Sign )
4542    {
4543        // 3PK is positive only, so clamp to zero
4544        Result[2] = 0;
4545    }
4546    else if (I > 0x477C0000U)
4547    {
4548        // The number is too large to be represented as a float10, set to max
4549        Result[2] = 0x3df;
4550    }
4551    else
4552    {
4553        if (I < 0x38800000U)
4554        {
4555            // The number is too small to be represented as a normalized float10
4556            // Convert it to a denormalized value.
4557            UINT Shift = 113U - (I >> 23U);
4558            I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
4559        }
4560        else
4561        {
4562            // Rebias the exponent to represent the value as a normalized float10
4563            I += 0xC8000000U;
4564        }
4565
4566        Result[2] = ((I + 0x1FFFFU + ((I >> 18U) & 1U)) >> 18U)&0x3ffU;
4567    }
4568
4569    // Pack Result into memory
4570    pDestination->v = (Result[0] & 0x7ff)
4571                      | ( (Result[1] & 0x7ff) << 11 )
4572                      | ( (Result[2] & 0x3ff) << 22 );
4573}
4574
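// Illustrative walkthrough of the float32 -> float11 conversion above for an
// input of 1.0f (raw bits 0x3F800000):
//
//     I += 0xC8000000;   // rebias exponent from 127 to 15: I = 0x07800000
//     Result[j] = ((I + 0xFFFF + ((I >> 17) & 1)) >> 17) & 0x7FF;
//               = 0x3C0  // 5-bit exponent = 15, 6-bit mantissa = 0, i.e. 1.0
//
// The z channel is handled the same way with a 5-bit mantissa (shifts of 18).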
4575
4576//------------------------------------------------------------------------------
4577
4578XMFINLINE VOID XMStoreFloat3SE
4579(
4580    XMFLOAT3SE* pDestination,
4581    FXMVECTOR V
4582)
4583{
4584    _DECLSPEC_ALIGN_16_ UINT IValue[4];
4585    UINT I, Sign, j, T;
4586    UINT Frac[3];
4587    UINT Exp[3];
4588
4589
4590    XMASSERT(pDestination);
4591
4592    XMStoreFloat3A( (XMFLOAT3A*)&IValue, V );
4593
4594    // X, Y, Z Channels (5-bit exponent, 9-bit mantissa)
4595    for(j=0; j < 3; ++j)
4596    {
4597        Sign = IValue[j] & 0x80000000;
4598        I = IValue[j] & 0x7FFFFFFF;
4599
4600        if ((I & 0x7F800000) == 0x7F800000)
4601        {
4602            // INF or NAN
4603            Exp[j] = 0x1f;
4604            if (( I & 0x7FFFFF ) != 0)
4605            {
4606                Frac[j] = ((I>>14)|(I>>5)|(I))&0x1ff;
4607            }
4608            else if ( Sign )
4609            {
4610                // -INF is clamped to 0 since 3SE is positive only
4611                Exp[j] = Frac[j] = 0;
4612            }
4613        }
4614        else if ( Sign )
4615        {
4616            // 3SE is positive only, so clamp to zero
4617            Exp[j] = Frac[j] = 0;
4618        }
4619        else if (I > 0x477FC000U)
4620        {
4621            // The number is too large, set to max
4622            Exp[j] = 0x1e;
4623            Frac[j] = 0x1ff;
4624        }
4625        else
4626        {
4627            if (I < 0x38800000U)
4628            {
4629                // The number is too small to be represented as a normalized value in this format
4630                // Convert it to a denormalized value.
4631                UINT Shift = 113U - (I >> 23U);
4632                I = (0x800000U | (I & 0x7FFFFFU)) >> Shift;
4633            }
4634            else
4635            {
4636                // Rebias the exponent to represent the value in normalized form for this format
4637                I += 0xC8000000U;
4638            }
4639
4640            T = ((I + 0x1FFFU + ((I >> 14U) & 1U)) >> 14U)&0x3fffU;
4641
4642            Exp[j] = (T & 0x3E00) >> 9;
4643            Frac[j] = T & 0x1ff;
4644        }
4645    }
4646
4647    // Adjust to a shared exponent
4648    T = XMMax( Exp[0], XMMax( Exp[1], Exp[2] ) );
4649
4650    Frac[0] = Frac[0] >> (T - Exp[0]);
4651    Frac[1] = Frac[1] >> (T - Exp[1]);
4652    Frac[2] = Frac[2] >> (T - Exp[2]);
4653
4654    // Store packed into memory
4655    pDestination->xm = Frac[0];
4656    pDestination->ym = Frac[1];
4657    pDestination->zm = Frac[2];
4658    pDestination->e = T;
4659}
4660
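// Note on the shared-exponent packing above: each channel is first reduced to a
// 5-bit exponent / 9-bit mantissa pair, the largest of the three exponents is
// kept as the shared exponent, and the mantissas of the other channels are
// right-shifted by the exponent difference.  Sketch: with Exp = {14, 12, 14}
// the shared exponent is 14 and the y mantissa loses two bits, Frac[1] >>= 2.
//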
4661//------------------------------------------------------------------------------
4662
4663XMFINLINE VOID XMStoreInt4
4664(
4665    UINT*    pDestination,
4666    FXMVECTOR V
4667)
4668{
4669#if defined(_XM_NO_INTRINSICS_)
4670
4671    XMASSERT(pDestination);
4672
4673    pDestination[0] = V.vector4_u32[0];
4674    pDestination[1] = V.vector4_u32[1];
4675    pDestination[2] = V.vector4_u32[2];
4676    pDestination[3] = V.vector4_u32[3];
4677
4678#elif defined(_XM_SSE_INTRINSICS_)
4679    XMASSERT(pDestination);
4680
4681    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
4682
4683#else // _XM_VMX128_INTRINSICS_
4684#endif // _XM_VMX128_INTRINSICS_
4685}
4686
4687//------------------------------------------------------------------------------
4688
4689XMFINLINE VOID XMStoreInt4A
4690(
4691    UINT*    pDestination,
4692    FXMVECTOR V
4693)
4694{
4695#if defined(_XM_NO_INTRINSICS_)
4696
4697    XMASSERT(pDestination);
4698    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4699
4700    pDestination[0] = V.vector4_u32[0];
4701    pDestination[1] = V.vector4_u32[1];
4702    pDestination[2] = V.vector4_u32[2];
4703    pDestination[3] = V.vector4_u32[3];
4704
4705#elif defined(_XM_SSE_INTRINSICS_)
4706    XMASSERT(pDestination);
4707    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4708
4709    _mm_store_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
4710
4711#else // _XM_VMX128_INTRINSICS_
4712#endif // _XM_VMX128_INTRINSICS_
4713}
4714
4715//------------------------------------------------------------------------------
4716
4717XMFINLINE VOID XMStoreSInt4
4718(
4719    XMINT4* pDestination,
4720    FXMVECTOR V
4721)
4722{
4723#if defined(_XM_NO_INTRINSICS_)
4724
4725    XMASSERT(pDestination);
4726
4727    pDestination->x = (INT)V.vector4_f32[0];
4728    pDestination->y = (INT)V.vector4_f32[1];
4729    pDestination->z = (INT)V.vector4_f32[2];
4730    pDestination->w = (INT)V.vector4_f32[3];
4731
4732#elif defined(_XM_SSE_INTRINSICS_)
4733    XMASSERT(pDestination);
4734
4735    // In case of positive overflow, detect it
4736    XMVECTOR vOverflow = _mm_cmpgt_ps(V,g_XMMaxInt);
4737    // Float to int conversion
4738    __m128i vResulti = _mm_cvttps_epi32(V);
4739    // If there was positive overflow, set to 0x7FFFFFFF
4740    XMVECTOR vResult = _mm_and_ps(vOverflow,g_XMAbsMask);
4741    vOverflow = _mm_andnot_ps(vOverflow,reinterpret_cast<const __m128 *>(&vResulti)[0]);
4742    vOverflow = _mm_or_ps(vOverflow,vResult);
4743    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&vOverflow)[0] );
4744
4745#else // _XM_VMX128_INTRINSICS_
4746#endif // _XM_VMX128_INTRINSICS_
4747}
4748
4749//------------------------------------------------------------------------------
4750
4751XMFINLINE VOID XMStoreUInt4
4752(
4753    XMUINT4* pDestination,
4754    FXMVECTOR V
4755)
4756{
4757#if defined(_XM_NO_INTRINSICS_)
4758
4759    XMASSERT(pDestination);
4760
4761    pDestination->x = (UINT)V.vector4_f32[0];
4762    pDestination->y = (UINT)V.vector4_f32[1];
4763    pDestination->z = (UINT)V.vector4_f32[2];
4764    pDestination->w = (UINT)V.vector4_f32[3];
4765
4766#elif defined(_XM_SSE_INTRINSICS_)
4767    XMASSERT(pDestination);
4768
4769    // Clamp to >=0
4770    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
4771    // Any numbers that are too big, set to 0xFFFFFFFFU
4772    XMVECTOR vOverflow = _mm_cmpgt_ps(vResult,g_XMMaxUInt);
4773    XMVECTOR vValue = g_XMUnsignedFix;
4774    // Too large for a signed integer?
4775    XMVECTOR vMask = _mm_cmpge_ps(vResult,vValue);
4776    // Zero for numbers lower than 0x80000000, 32768.0f*65536.0f otherwise
4777    vValue = _mm_and_ps(vValue,vMask);
4778    // Perform fixup only on numbers too large (Keeps low bit precision)
4779    vResult = _mm_sub_ps(vResult,vValue);
4780    __m128i vResulti = _mm_cvttps_epi32(vResult);
4781    // Convert from signed to unsigned only if greater than 0x80000000
4782    vMask = _mm_and_ps(vMask,g_XMNegativeZero);
4783    vResult = _mm_xor_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],vMask);
4784    // On those that are too large, set to 0xFFFFFFFF
4785    vResult = _mm_or_ps(vResult,vOverflow);
4786    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&vResult)[0] );
4787
4788#else // _XM_VMX128_INTRINSICS_
4789#endif // _XM_VMX128_INTRINSICS_
4790}
4791
4792//------------------------------------------------------------------------------
4793
4794XMFINLINE VOID XMStoreInt4NC
4795(
4796    UINT*    pDestination,
4797    FXMVECTOR V
4798)
4799{
4800#if defined(_XM_NO_INTRINSICS_)
4801
4802    XMASSERT(pDestination);
4803    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
4804
4805    pDestination[0] = V.vector4_u32[0];
4806    pDestination[1] = V.vector4_u32[1];
4807    pDestination[2] = V.vector4_u32[2];
4808    pDestination[3] = V.vector4_u32[3];
4809
4810#elif defined(_XM_SSE_INTRINSICS_)
4811    XMASSERT(pDestination);
4812    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
4813
4814    _mm_storeu_si128( (__m128i*)pDestination, reinterpret_cast<const __m128i *>(&V)[0] );
4815
4816#else // _XM_VMX128_INTRINSICS_
4817#endif // _XM_VMX128_INTRINSICS_
4818}
4819
4820//------------------------------------------------------------------------------
4821
4822XMFINLINE VOID XMStoreFloat4
4823(
4824    XMFLOAT4* pDestination,
4825    FXMVECTOR  V
4826)
4827{
4828#if defined(_XM_NO_INTRINSICS_)
4829
4830    XMASSERT(pDestination);
4831
4832    pDestination->x = V.vector4_f32[0];
4833    pDestination->y = V.vector4_f32[1];
4834    pDestination->z = V.vector4_f32[2];
4835    pDestination->w = V.vector4_f32[3];
4836
4837#elif defined(_XM_SSE_INTRINSICS_)
4838    XMASSERT(pDestination);
4839
4840    _mm_storeu_ps( &pDestination->x, V );
4841
4842#else // _XM_VMX128_INTRINSICS_
4843#endif // _XM_VMX128_INTRINSICS_
4844}
4845
4846//------------------------------------------------------------------------------
4847
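// The "A" variant requires a 16-byte aligned destination (XMFLOAT4A is declared
// with 16-byte alignment), which allows the single aligned _mm_store_ps below.
//
// Illustrative sketch (not part of the library):
//     XMFLOAT4A dest;                                   // 16-byte aligned type
//     XMStoreFloat4A(&dest, XMVectorSet(1.0f, 2.0f, 3.0f, 4.0f));
//     // dest = {1, 2, 3, 4}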
4848XMFINLINE VOID XMStoreFloat4A
4849(
4850    XMFLOAT4A*   pDestination,
4851    FXMVECTOR     V
4852)
4853{
4854#if defined(_XM_NO_INTRINSICS_)
4855
4856    XMASSERT(pDestination);
4857    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4858
4859    pDestination->x = V.vector4_f32[0];
4860    pDestination->y = V.vector4_f32[1];
4861    pDestination->z = V.vector4_f32[2];
4862    pDestination->w = V.vector4_f32[3];
4863
4864#elif defined(_XM_SSE_INTRINSICS_)
4865    XMASSERT(pDestination);
4866    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
4867
4868    _mm_store_ps( &pDestination->x, V );
4869#else // _XM_VMX128_INTRINSICS_
4870#endif // _XM_VMX128_INTRINSICS_
4871}
4872
4873//------------------------------------------------------------------------------
4874
4875XMFINLINE VOID XMStoreFloat4NC
4876(
4877    XMFLOAT4* pDestination,
4878    FXMVECTOR  V
4879)
4880{
4881#if defined(_XM_NO_INTRINSICS_)
4882
4883    XMASSERT(pDestination);
4884    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
4885
4886    pDestination->x = V.vector4_f32[0];
4887    pDestination->y = V.vector4_f32[1];
4888    pDestination->z = V.vector4_f32[2];
4889    pDestination->w = V.vector4_f32[3];
4890
4891#elif defined(_XM_SSE_INTRINSICS_)
4892    XMASSERT(pDestination);
4893    XMASSERT(((UINT_PTR)pDestination & 3) == 0);
4894
4895    _mm_storeu_ps( &pDestination->x, V );
4896
4897#else // _XM_VMX128_INTRINSICS_
4898#endif // _XM_VMX128_INTRINSICS_
4899}
4900
4901//------------------------------------------------------------------------------
4902
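// Implementation note: SSE2 provides no float/half conversion instructions, so the
// SSE path below reads each lane back with XMVectorGetX/Y/Z/W and converts it with
// the scalar XMConvertFloatToHalf helper, just as the no-intrinsics path does.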
4903XMFINLINE VOID XMStoreHalf4
4904(
4905    XMHALF4* pDestination,
4906    FXMVECTOR V
4907)
4908{
4909#if defined(_XM_NO_INTRINSICS_)
4910
4911    XMASSERT(pDestination);
4912
4913    pDestination->x = XMConvertFloatToHalf(V.vector4_f32[0]);
4914    pDestination->y = XMConvertFloatToHalf(V.vector4_f32[1]);
4915    pDestination->z = XMConvertFloatToHalf(V.vector4_f32[2]);
4916    pDestination->w = XMConvertFloatToHalf(V.vector4_f32[3]);
4917
4918#elif defined(_XM_SSE_INTRINSICS_)
4919    XMASSERT(pDestination);
4920    pDestination->x = XMConvertFloatToHalf(XMVectorGetX(V));
4921    pDestination->y = XMConvertFloatToHalf(XMVectorGetY(V));
4922    pDestination->z = XMConvertFloatToHalf(XMVectorGetZ(V));
4923    pDestination->w = XMConvertFloatToHalf(XMVectorGetW(V));
4924#else // _XM_VMX128_INTRINSICS_
4925#endif // _XM_VMX128_INTRINSICS_
4926}
4927
4928//------------------------------------------------------------------------------
4929
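// Implementation note: the SSE path scales the clamped input by 32767, converts
// with _mm_cvtps_epi32 (round-to-nearest under the default MXCSR mode), packs the
// four 32-bit results to 16-bit with signed saturation (_mm_packs_epi32), and
// stores the low 64 bits with _mm_store_sd.
//
// Illustrative sketch (not part of the library):
//     XMSHORTN4 out;
//     XMStoreShortN4(&out, XMVectorSet(1.0f, -1.0f, 0.5f, 0.0f));
//     // expected: out = {32767, -32767, 16384, 0}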
4930XMFINLINE VOID XMStoreShortN4
4931(
4932    XMSHORTN4* pDestination,
4933    FXMVECTOR   V
4934)
4935{
4936#if defined(_XM_NO_INTRINSICS_)
4937
4938    XMVECTOR               N;
4939    static CONST XMVECTORF32  Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4940
4941    XMASSERT(pDestination);
4942
4943    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
4944    N = XMVectorMultiply(N, Scale.v);
4945    N = XMVectorRound(N);
4946
4947    pDestination->x = (SHORT)N.vector4_f32[0];
4948    pDestination->y = (SHORT)N.vector4_f32[1];
4949    pDestination->z = (SHORT)N.vector4_f32[2];
4950    pDestination->w = (SHORT)N.vector4_f32[3];
4951
4952#elif defined(_XM_SSE_INTRINSICS_)
4953    XMASSERT(pDestination);
4954    static CONST XMVECTORF32 Scale = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4955
4956    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
4957    vResult = _mm_min_ps(vResult,g_XMOne);
4958    vResult = _mm_mul_ps(vResult,Scale);
4959    __m128i vResulti = _mm_cvtps_epi32(vResult);
4960    vResulti = _mm_packs_epi32(vResulti,vResulti);
4961    _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vResulti)[0]);
4962#else // _XM_VMX128_INTRINSICS_
4963#endif // _XM_VMX128_INTRINSICS_
4964}
4965
4966//------------------------------------------------------------------------------
4967
4968XMFINLINE VOID XMStoreShort4
4969(
4970    XMSHORT4* pDestination,
4971    FXMVECTOR  V
4972)
4973{
4974#if defined(_XM_NO_INTRINSICS_)
4975
4976    XMVECTOR               N;
4977    static CONST XMVECTOR  Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
4978    static CONST XMVECTOR  Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4979
4980    XMASSERT(pDestination);
4981
4982    N = XMVectorClamp(V, Min, Max);
4983    N = XMVectorRound(N);
4984
4985    pDestination->x = (SHORT)N.vector4_f32[0];
4986    pDestination->y = (SHORT)N.vector4_f32[1];
4987    pDestination->z = (SHORT)N.vector4_f32[2];
4988    pDestination->w = (SHORT)N.vector4_f32[3];
4989
4990#elif defined(_XM_SSE_INTRINSICS_)
4991    XMASSERT(pDestination);
4992    static CONST XMVECTORF32 Min = {-32767.0f, -32767.0f, -32767.0f, -32767.0f};
4993    static CONST XMVECTORF32  Max = {32767.0f, 32767.0f, 32767.0f, 32767.0f};
4994    // Bounds check
4995    XMVECTOR vResult = _mm_max_ps(V,Min);
4996    vResult = _mm_min_ps(vResult,Max);
4997    // Convert to int with rounding
4998    __m128i vInt = _mm_cvtps_epi32(vResult);
4999    // Pack the ints into shorts
5000    vInt = _mm_packs_epi32(vInt,vInt);
5001    _mm_store_sd(reinterpret_cast<double *>(&pDestination->x),reinterpret_cast<const __m128d *>(&vInt)[0]);
5002#else // _XM_VMX128_INTRINSICS_
5003#endif // _XM_VMX128_INTRINSICS_
5004}
5005
5006//------------------------------------------------------------------------------
5007
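// Implementation note: there is no 32-bit to unsigned 16-bit saturating pack in
// SSE2 (_mm_packus_epi32 arrived with SSE4.1), so the SSE path below extracts the
// four converted lanes one at a time with _mm_extract_epi16 instead of packing.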
5008XMFINLINE VOID XMStoreUShortN4
5009(
5010    XMUSHORTN4* pDestination,
5011    FXMVECTOR    V
5012)
5013{
5014#if defined(_XM_NO_INTRINSICS_)
5015
5016    XMVECTOR               N;
5017    static CONST XMVECTORF32  Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
5018
5019    XMASSERT(pDestination);
5020
5021    N = XMVectorSaturate(V);
5022    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
5023    N = XMVectorTruncate(N);
5024
5025    pDestination->x = (USHORT)N.vector4_f32[0];
5026    pDestination->y = (USHORT)N.vector4_f32[1];
5027    pDestination->z = (USHORT)N.vector4_f32[2];
5028    pDestination->w = (USHORT)N.vector4_f32[3];
5029
5030#elif defined(_XM_SSE_INTRINSICS_)
5031    XMASSERT(pDestination);
5032    static CONST XMVECTORF32 Scale = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
5033    // Bounds check
5034    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5035    vResult = _mm_min_ps(vResult,g_XMOne);
5036    vResult = _mm_mul_ps(vResult,Scale);
5037    // Convert to int with rounding
5038    __m128i vInt = _mm_cvtps_epi32(vResult);
5039    // Since the SSE pack instruction clamps using signed rules,
5040    // manually extract the values to store them to memory
5041    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
5042    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
5043    pDestination->z = static_cast<SHORT>(_mm_extract_epi16(vInt,4));
5044    pDestination->w = static_cast<SHORT>(_mm_extract_epi16(vInt,6));
5045#else // _XM_VMX128_INTRINSICS_
5046#endif // _XM_VMX128_INTRINSICS_
5047}
5048
5049//------------------------------------------------------------------------------
5050
5051XMFINLINE VOID XMStoreUShort4
5052(
5053    XMUSHORT4* pDestination,
5054    FXMVECTOR   V
5055)
5056{
5057#if defined(_XM_NO_INTRINSICS_)
5058
5059    XMVECTOR               N;
5060    static CONST XMVECTOR  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
5061
5062    XMASSERT(pDestination);
5063
5064    N = XMVectorClamp(V, XMVectorZero(), Max);
5065    N = XMVectorRound(N);
5066
5067    pDestination->x = (USHORT)N.vector4_f32[0];
5068    pDestination->y = (USHORT)N.vector4_f32[1];
5069    pDestination->z = (USHORT)N.vector4_f32[2];
5070    pDestination->w = (USHORT)N.vector4_f32[3];
5071
5072#elif defined(_XM_SSE_INTRINSICS_)
5073    XMASSERT(pDestination);
5074    static CONST XMVECTORF32  Max = {65535.0f, 65535.0f, 65535.0f, 65535.0f};
5075    // Bounds check
5076    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5077    vResult = _mm_min_ps(vResult,Max);
5078    // Convert to int with rounding
5079    __m128i vInt = _mm_cvtps_epi32(vResult);
5080    // Since the SSE pack instruction clamps using signed rules,
5081    // manually extract the values to store them to memory
5082    pDestination->x = static_cast<SHORT>(_mm_extract_epi16(vInt,0));
5083    pDestination->y = static_cast<SHORT>(_mm_extract_epi16(vInt,2));
5084    pDestination->z = static_cast<SHORT>(_mm_extract_epi16(vInt,4));
5085    pDestination->w = static_cast<SHORT>(_mm_extract_epi16(vInt,6));
5086#else // _XM_VMX128_INTRINSICS_
5087#endif // _XM_VMX128_INTRINSICS_
5088}
5089
5090//------------------------------------------------------------------------------
5091
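// Packing note for the XIco formats: the destination is a 64-bit field laid out as
// w<<60 | z<<40 | y<<20 | x, with x/y/z as signed 20-bit values and w as a 4-bit
// unsigned value. The SSE paths below swizzle the input to (x, w, y, z), then use
// per-lane scales that already include the required power of two so each component
// lands at its bit offset within its 32-bit lane. The w lane is scaled by half and
// doubled after the conversion ("Double Y") so the float-to-signed-int conversion
// never overflows. Because _mm_srli_si128/_mm_slli_si128 shift by whole bytes, the
// two shifts give a net 56-bit right shift that drops y and z into bits 20..59, and
// the final OR plus _mm_store_sd writes the assembled 64-bit result.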
5092XMFINLINE VOID XMStoreXIcoN4
5093(
5094    XMXICON4*  pDestination,
5095    FXMVECTOR   V
5096)
5097{
5098#if defined(_XM_NO_INTRINSICS_)
5099
5100    XMVECTOR               N;
5101    static CONST XMVECTORF32  Min = {-1.0f, -1.0f, -1.0f, 0.0f};
5102    static CONST XMVECTORF32  Scale = {524287.0f, 524287.0f, 524287.0f, 15.0f};
5103
5104    XMASSERT(pDestination);
5105
5106    N = XMVectorClamp(V, Min.v, g_XMOne.v);
5107    N = XMVectorMultiply(N, Scale.v);
5108    N = XMVectorRound(N);
5109
5110    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
5111                       (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
5112                       (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
5113                       (((INT64)N.vector4_f32[0] & 0xFFFFF));
5114
5115#elif defined(_XM_SSE_INTRINSICS_)
5116    XMASSERT(pDestination);
5117    // Note: Masks are x,w,y and z
5118    static const XMVECTORF32 MinXIcoN4 = {-1.0f, 0.0f,-1.0f,-1.0f};
5119    static const XMVECTORF32 ScaleXIcoN4 = {524287.0f,15.0f*4096.0f*65536.0f*0.5f,524287.0f*4096.0f,524287.0f};
5120    static const XMVECTORI32 MaskXIcoN4 = {0xFFFFF,0xF<<((60-32)-1),0xFFFFF000,0xFFFFF};
5121
5122    // Clamp to bounds
5123    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
5124    vResult = _mm_max_ps(vResult,MinXIcoN4);
5125    vResult = _mm_min_ps(vResult,g_XMOne);
5126    // Scale by multiplication
5127    vResult = _mm_mul_ps(vResult,ScaleXIcoN4);
5128    // Convert to integer (w is unsigned)
5129    __m128i vResulti = _mm_cvttps_epi32(vResult);
5130    // Mask off unused bits
5131    vResulti = _mm_and_si128(vResulti,MaskXIcoN4);
5132    // Isolate Y
5133    __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
5134    // Double Y (really W) to fix up for the unsigned conversion
5135    vResulti = _mm_add_epi32(vResulti,vResulti2);
5136    // Shift y and z to straddle the 32-bit boundary
5137    vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
5138    // Shift it into place
5139    vResulti2 = _mm_slli_si128(vResulti2,20/8);
5140    // i = x|y<<20|z<<40|w<<60
5141    vResulti = _mm_or_si128(vResulti,vResulti2);
5142    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
5143#else // _XM_VMX128_INTRINSICS_
5144#endif // _XM_VMX128_INTRINSICS_
5145}
5146
5147//------------------------------------------------------------------------------
5148
5149XMFINLINE VOID XMStoreXIco4
5150(
5151    XMXICO4*  pDestination,
5152    FXMVECTOR  V
5153)
5154{
5155#if defined(_XM_NO_INTRINSICS_)
5156
5157    XMVECTOR N;
5158    static CONST XMVECTORF32 Min = {-524287.0f, -524287.0f, -524287.0f, 0.0f};
5159    static CONST XMVECTORF32 Max = {524287.0f, 524287.0f, 524287.0f, 15.0f};
5160
5161    XMASSERT(pDestination);
5162    N = XMVectorClamp(V, Min.v, Max.v);
5163    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
5164                       (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
5165                       (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
5166                       (((INT64)N.vector4_f32[0] & 0xFFFFF));
5167
5168#elif defined(_XM_SSE_INTRINSICS_)
5169    XMASSERT(pDestination);
5170    // Note: Masks are x,w,y and z
5171    static const XMVECTORF32 MinXIco4 = {-524287.0f, 0.0f,-524287.0f,-524287.0f};
5172    static const XMVECTORF32 MaxXIco4 = { 524287.0f,15.0f, 524287.0f, 524287.0f};
5173    static const XMVECTORF32 ScaleXIco4 = {1.0f,4096.0f*65536.0f*0.5f,4096.0f,1.0f};
5174    static const XMVECTORI32 MaskXIco4 = {0xFFFFF,0xF<<((60-1)-32),0xFFFFF000,0xFFFFF};
5175    // Clamp to bounds
5176    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
5177    vResult = _mm_max_ps(vResult,MinXIco4);
5178    vResult = _mm_min_ps(vResult,MaxXIco4);
5179    // Scale by multiplication
5180    vResult = _mm_mul_ps(vResult,ScaleXIco4);
5181    // Convert to int
5182    __m128i vResulti = _mm_cvttps_epi32(vResult);
5183    // Mask off any fraction
5184    vResulti = _mm_and_si128(vResulti,MaskXIco4);
5185    // Isolate Y
5186    __m128i vResulti2 = _mm_and_si128(vResulti,g_XMMaskY);
5187    // Double Y (really W) to fix up for the unsigned conversion
5188    vResulti = _mm_add_epi32(vResulti,vResulti2);
5189    // Shift y and z to straddle the 32-bit boundary
5190    vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
5191    // Shift it into place
5192    vResulti2 = _mm_slli_si128(vResulti2,20/8);
5193    // i = x|y<<20|z<<40|w<<60
5194    vResulti = _mm_or_si128(vResulti,vResulti2);
5195    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
5196#else // _XM_VMX128_INTRINSICS_
5197#endif // _XM_VMX128_INTRINSICS_
5198}
5199
5200//------------------------------------------------------------------------------
5201
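// Packing note for the UIco formats: same 64-bit layout as the XIco formats, but
// x/y/z are unsigned 20-bit values. After the (x, w, y, z) swizzle, the scaled w
// and y lanes can exceed the signed 32-bit range, so those two lanes are biased
// down by 2^31 before _mm_cvttps_epi32 and the sign bit is XORed back afterwards
// (g_XMFlipYZ), the standard SSE2 float-to-unsigned fixup, before the usual mask,
// byte-shift and OR steps assemble the 64-bit value.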
5202XMFINLINE VOID XMStoreUIcoN4
5203(
5204    XMUICON4*  pDestination,
5205    FXMVECTOR   V
5206)
5207{
5208    #define XM_URange       ((FLOAT)(1 << 20))
5209    #define XM_URangeDiv2   ((FLOAT)(1 << 19))
5210    #define XM_UMaxXYZ      ((FLOAT)((1 << 20) - 1))
5211    #define XM_UMaxW        ((FLOAT)((1 << 4) - 1))
5212    #define XM_ScaleXYZ     (-(FLOAT)((1 << 20) - 1) / XM_PACK_FACTOR)
5213    #define XM_ScaleW       (-(FLOAT)((1 << 4) - 1) / XM_PACK_FACTOR)
5214    #define XM_Scale        (-1.0f / XM_PACK_FACTOR)
5215    #define XM_Offset       (3.0f)
5216
5217#if defined(_XM_NO_INTRINSICS_)
5218
5219    XMVECTOR               N;
5220    static CONST XMVECTORF32 Scale = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};
5221
5222    XMASSERT(pDestination);
5223
5224    N = XMVectorSaturate(V);
5225    N = XMVectorMultiplyAdd(N, Scale.v, g_XMOneHalf.v);
5226
5227    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
5228                       (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
5229                       (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
5230                       (((UINT64)N.vector4_f32[0] & 0xFFFFF));
5231
5232#elif defined(_XM_SSE_INTRINSICS_)
5233    XMASSERT(pDestination);
5234    // Note: Masks are x,w,y and z
5235    static const XMVECTORF32 ScaleUIcoN4 = {1048575.0f,15.0f*4096.0f*65536.0f,1048575.0f*4096.0f,1048575.0f};
5236    static const XMVECTORI32 MaskUIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
5237    static const XMVECTORF32 AddUIcoN4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};
5238    // Clamp to bounds
5239    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
5240    vResult = _mm_max_ps(vResult,g_XMZero);
5241    vResult = _mm_min_ps(vResult,g_XMOne);
5242    // Scale by multiplication
5243    vResult = _mm_mul_ps(vResult,ScaleUIcoN4);
5244    // Adjust for unsigned entries
5245    vResult = _mm_add_ps(vResult,AddUIcoN4);
5246    // Convert to int
5247    __m128i vResulti = _mm_cvttps_epi32(vResult);
5248    // Fix the signs on the unsigned entries
5249    vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
5250    // Mask off any fraction
5251    vResulti = _mm_and_si128(vResulti,MaskUIcoN4);
5252    // Shift y and z to straddle the 32-bit boundary
5253    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
5254    // Shift it into place
5255    vResulti2 = _mm_slli_si128(vResulti2,20/8);
5256    // i = x|y<<20|z<<40|w<<60
5257    vResulti = _mm_or_si128(vResulti,vResulti2);
5258    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
5259#else // _XM_VMX128_INTRINSICS_
5260#endif // _XM_VMX128_INTRINSICS_
5261
5262    #undef XM_URange
5263    #undef XM_URangeDiv2
5264    #undef XM_UMaxXYZ
5265    #undef XM_UMaxW
5266    #undef XM_ScaleXYZ
5267    #undef XM_ScaleW
5268    #undef XM_Scale
5269    #undef XM_Offset
5270}
5271
5272//------------------------------------------------------------------------------
5273
5274XMFINLINE VOID XMStoreUIco4
5275(
5276    XMUICO4*  pDestination,
5277    FXMVECTOR  V
5278)
5279{
5280    #define XM_Scale        (-1.0f / XM_PACK_FACTOR)
5281    #define XM_URange       ((FLOAT)(1 << 20))
5282    #define XM_URangeDiv2   ((FLOAT)(1 << 19))
5283
5284#if defined(_XM_NO_INTRINSICS_)
5285
5286    XMVECTOR               N;
5287    static CONST XMVECTOR  Max = {1048575.0f, 1048575.0f, 1048575.0f, 15.0f};
5288
5289    XMASSERT(pDestination);
5290
5291    N = XMVectorClamp(V, XMVectorZero(), Max);
5292    N = XMVectorRound(N);
5293
5294    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
5295                       (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
5296                       (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
5297                       (((UINT64)N.vector4_f32[0] & 0xFFFFF));
5298
5299#elif defined(_XM_SSE_INTRINSICS_)
5300    XMASSERT(pDestination);
5301    // Note: Masks are x,w,y and z
5302    static const XMVECTORF32 MaxUIco4 = { 1048575.0f, 15.0f, 1048575.0f, 1048575.0f};
5303    static const XMVECTORF32 ScaleUIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
5304    static const XMVECTORI32 MaskUIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
5305    static const XMVECTORF32 AddUIco4 = {0.0f,-32768.0f*65536.0f,-32768.0f*65536.0f,0.0f};
5306    // Clamp to bounds
5307    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
5308    vResult = _mm_max_ps(vResult,g_XMZero);
5309    vResult = _mm_min_ps(vResult,MaxUIco4);
5310    // Scale by multiplication
5311    vResult = _mm_mul_ps(vResult,ScaleUIco4);
5312    vResult = _mm_add_ps(vResult,AddUIco4);
5313    // Convert to int
5314    __m128i vResulti = _mm_cvttps_epi32(vResult);
5315    vResulti = _mm_xor_si128(vResulti,g_XMFlipYZ);
5316    // Mask off any fraction
5317    vResulti = _mm_and_si128(vResulti,MaskUIco4);
5318    // Shift y and z to straddle the 32-bit boundary
5319    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
5320    // Shift it into place
5321    vResulti2 = _mm_slli_si128(vResulti2,20/8);
5322    // i = x|y<<20|z<<40|w<<60
5323    vResulti = _mm_or_si128(vResulti,vResulti2);
5324    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
5325#else // _XM_VMX128_INTRINSICS_
5326#endif // _XM_VMX128_INTRINSICS_
5327
5328    #undef XM_Scale
5329    #undef XM_URange
5330    #undef XM_URangeDiv2
5331}
5332
5333//------------------------------------------------------------------------------
5334
5335XMFINLINE VOID XMStoreIcoN4
5336(
5337    XMICON4*  pDestination,
5338    FXMVECTOR  V
5339)
5340{
5341    #define XM_Scale    (-1.0f / XM_PACK_FACTOR)
5342    #define XM_URange   ((FLOAT)(1 << 4))
5343    #define XM_Offset   (3.0f)
5344    #define XM_UMaxXYZ  ((FLOAT)((1 << (20 - 1)) - 1))
5345    #define XM_UMaxW    ((FLOAT)((1 << (4 - 1)) - 1))
5346
5347#if defined(_XM_NO_INTRINSICS_)
5348
5349    XMVECTOR               N;
5350    static CONST XMVECTORF32  Scale = {524287.0f, 524287.0f, 524287.0f, 7.0f};
5351
5352    XMASSERT(pDestination);
5353
5354    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
5355    N = XMVectorMultiplyAdd(N, Scale.v, g_XMNegativeZero.v);
5356    N = XMVectorRound(N);
5357
5358    pDestination->v = ((UINT64)N.vector4_f32[3] << 60) |
5359                       (((UINT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
5360                       (((UINT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
5361                       (((UINT64)N.vector4_f32[0] & 0xFFFFF));
5362
5363#elif defined(_XM_SSE_INTRINSICS_)
5364    XMASSERT(pDestination);
5365    // Note: Masks are x,w,y and z
5366    static const XMVECTORF32 ScaleIcoN4 = {524287.0f,7.0f*4096.0f*65536.0f,524287.0f*4096.0f,524287.0f};
5367    static const XMVECTORI32 MaskIcoN4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
5368    // Clamp to bounds
5369    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
5370    vResult = _mm_max_ps(vResult,g_XMNegativeOne);
5371    vResult = _mm_min_ps(vResult,g_XMOne);
5372    // Scale by multiplication
5373    vResult = _mm_mul_ps(vResult,ScaleIcoN4);
5374    // Convert to int
5375    __m128i vResulti = _mm_cvttps_epi32(vResult);
5376    // Mask off any fraction
5377    vResulti = _mm_and_si128(vResulti,MaskIcoN4);
5378    // Shift y and z to straddle the 32-bit boundary
5379    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
5380    // Shift it into place
5381    vResulti2 = _mm_slli_si128(vResulti2,20/8);
5382    // i = x|y<<20|z<<40|w<<60
5383    vResulti = _mm_or_si128(vResulti,vResulti2);
5384    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
5385#else // _XM_VMX128_INTRINSICS_
5386#endif // _XM_VMX128_INTRINSICS_
5387
5388    #undef XM_Scale
5389    #undef XM_URange
5390    #undef XM_Offset
5391    #undef XM_UMaxXYZ
5392    #undef XM_UMaxW
5393}
5394
5395//------------------------------------------------------------------------------
5396
5397XMFINLINE VOID XMStoreIco4
5398(
5399    XMICO4*  pDestination,
5400    FXMVECTOR V
5401)
5402{
5403    #define XM_Scale    (-1.0f / XM_PACK_FACTOR)
5404    #define XM_URange   ((FLOAT)(1 << 4))
5405    #define XM_Offset   (3.0f)
5406
5407#if defined(_XM_NO_INTRINSICS_)
5408
5409    XMVECTOR               N;
5410    static CONST XMVECTOR  Min = {-524287.0f, -524287.0f, -524287.0f, -7.0f};
5411    static CONST XMVECTOR  Max = {524287.0f, 524287.0f, 524287.0f, 7.0f};
5412
5413    XMASSERT(pDestination);
5414
5415    N = XMVectorClamp(V, Min, Max);
5416    N = XMVectorRound(N);
5417
5418    pDestination->v = ((INT64)N.vector4_f32[3] << 60) |
5419                       (((INT64)N.vector4_f32[2] & 0xFFFFF) << 40) |
5420                       (((INT64)N.vector4_f32[1] & 0xFFFFF) << 20) |
5421                       (((INT64)N.vector4_f32[0] & 0xFFFFF));
5422
5423#elif defined(_XM_SSE_INTRINSICS_)
5424    XMASSERT(pDestination);
5425    // Note: Masks are x,w,y and z
5426    static const XMVECTORF32 MinIco4 = {-524287.0f,-7.0f,-524287.0f,-524287.0f};
5427    static const XMVECTORF32 MaxIco4 = { 524287.0f, 7.0f, 524287.0f, 524287.0f};
5428    static const XMVECTORF32 ScaleIco4 = {1.0f,4096.0f*65536.0f,4096.0f,1.0f};
5429    static const XMVECTORI32 MaskIco4 = {0xFFFFF,0xF<<(60-32),0xFFFFF000,0xFFFFF};
5430    // Clamp to bounds
5431    XMVECTOR vResult = _mm_shuffle_ps(V,V,_MM_SHUFFLE(2,1,3,0));
5432    vResult = _mm_max_ps(vResult,MinIco4);
5433    vResult = _mm_min_ps(vResult,MaxIco4);
5434    // Scale by multiplication
5435    vResult = _mm_mul_ps(vResult,ScaleIco4);
5436    // Convert to int
5437    __m128i vResulti = _mm_cvttps_epi32(vResult);
5438    // Mask off any fraction
5439    vResulti = _mm_and_si128(vResulti,MaskIco4);
5440    // Shift y and z to straddle the 32-bit boundary
5441    __m128i vResulti2 = _mm_srli_si128(vResulti,(64+12)/8);
5442    // Shift it into place
5443    vResulti2 = _mm_slli_si128(vResulti2,20/8);
5444    // i = x|y<<20|z<<40|w<<60
5445    vResulti = _mm_or_si128(vResulti,vResulti2);
5446    _mm_store_sd(reinterpret_cast<double *>(&pDestination->v),reinterpret_cast<const __m128d *>(&vResulti)[0]);
5447#else // _XM_VMX128_INTRINSICS_
5448#endif // _XM_VMX128_INTRINSICS_
5449
5450    #undef XM_Scale
5451    #undef XM_URange
5452    #undef XM_Offset
5453}
5454
5455//------------------------------------------------------------------------------
5456
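// Packing note for the Dec/UDec formats: the destination is a 32-bit field laid out
// as w<<30 | z<<20 | y<<10 | x. The SSE paths scale each lane by the component's
// maximum value times the power of two for its bit offset, mask off stray bits, and
// then OR the four lanes together. In XMStoreXDecN4 the w lane uses 2^29 instead of
// 2^30 (to keep the signed conversion in range) and is added to itself afterwards,
// and the four lanes are collapsed with three rotate-and-OR steps before the low
// 32 bits are written with _mm_store_ss.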
5457XMFINLINE VOID XMStoreXDecN4
5458(
5459    XMXDECN4* pDestination,
5460    FXMVECTOR  V
5461)
5462{
5463#if defined(_XM_NO_INTRINSICS_)
5464
5465    XMVECTOR               N;
5466    static CONST XMVECTORF32  Min = {-1.0f, -1.0f, -1.0f, 0.0f};
5467    static CONST XMVECTORF32  Scale = {511.0f, 511.0f, 511.0f, 3.0f};
5468
5469    XMASSERT(pDestination);
5470
5471    N = XMVectorClamp(V, Min.v, g_XMOne.v);
5472    N = XMVectorMultiply(N, Scale.v);
5473    N = XMVectorRound(N);
5474
5475    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
5476                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
5477                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
5478                       (((INT)N.vector4_f32[0] & 0x3FF));
5479
5480#elif defined(_XM_SSE_INTRINSICS_)
5481    static const XMVECTORF32 Min = {-1.0f, -1.0f, -1.0f, 0.0f};
5482    static const XMVECTORF32 Scale = {511.0f, 511.0f*1024.0f, 511.0f*1048576.0f,3.0f*536870912.0f};
5483    static const XMVECTORI32 ScaleMask = {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<29};
5484    XMASSERT(pDestination);
5485    XMVECTOR vResult = _mm_max_ps(V,Min);
5486    vResult = _mm_min_ps(vResult,g_XMOne);
5487    // Scale by multiplication
5488    vResult = _mm_mul_ps(vResult,Scale);
5489    // Convert to int (W is unsigned)
5490    __m128i vResulti = _mm_cvtps_epi32(vResult);
5491    // Mask off any fraction
5492    vResulti = _mm_and_si128(vResulti,ScaleMask);
5493    // To fix W, add it to itself to shift it up to <<30 instead of <<29
5494    __m128i vResultw = _mm_and_si128(vResulti,g_XMMaskW);
5495    vResulti = _mm_add_epi32(vResulti,vResultw);
5496    // Do a horizontal or of all 4 entries
5497    vResult = _mm_shuffle_ps(reinterpret_cast<const __m128 *>(&vResulti)[0],reinterpret_cast<const __m128 *>(&vResulti)[0],_MM_SHUFFLE(0,3,2,1));
5498    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
5499    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
5500    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
5501    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(0,3,2,1));
5502    vResulti = _mm_or_si128(vResulti,reinterpret_cast<const __m128i *>(&vResult)[0]);
5503    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5504#else // _XM_VMX128_INTRINSICS_
5505#endif // _XM_VMX128_INTRINSICS_
5506}
5507
5508//------------------------------------------------------------------------------
5509
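// Several of the Dec/Byte stores below (XMStoreXDec4, XMStoreUDecN4, XMStoreUDec4,
// XMStoreUByteN4, XMStoreUByte4) share one pattern: lanes 1 and 3 (y and w) are
// scaled by half their final power of two so the float-to-int conversion stays in
// signed range, the converted lanes are masked, a shuffle ORs z into x and w into y,
// the combined y|w lane is added to itself (a one-bit left shift that restores the
// full offsets), and the final OR leaves the packed value in lane 0 for
// _mm_store_ss. The signed variants (DecN4, Dec4, ByteN4, Byte4) fit in signed
// range with full scales and skip the doubling step.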
5510XMFINLINE VOID XMStoreXDec4
5511(
5512    XMXDEC4* pDestination,
5513    FXMVECTOR  V
5514)
5515{
5516#if defined(_XM_NO_INTRINSICS_)
5517
5518    XMVECTOR               N;
5519    static CONST XMVECTOR  Min = {-511.0f, -511.0f, -511.0f, 0.0f};
5520    static CONST XMVECTOR  Max = {511.0f, 511.0f, 511.0f, 3.0f};
5521
5522    XMASSERT(pDestination);
5523
5524    N = XMVectorClamp(V, Min, Max);
5525
5526    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
5527                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
5528                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
5529                       (((INT)N.vector4_f32[0] & 0x3FF));
5530
5531#elif defined(_XM_SSE_INTRINSICS_)
5532    XMASSERT(pDestination);
5533    static const XMVECTORF32 MinXDec4 = {-511.0f,-511.0f,-511.0f, 0.0f};
5534    static const XMVECTORF32 MaxXDec4 = { 511.0f, 511.0f, 511.0f, 3.0f};
5535    static const XMVECTORF32 ScaleXDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
5536    static const XMVECTORI32 MaskXDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
5537    // Clamp to bounds
5538    XMVECTOR vResult = _mm_max_ps(V,MinXDec4);
5539    vResult = _mm_min_ps(vResult,MaxXDec4);
5540    // Scale by multiplication
5541    vResult = _mm_mul_ps(vResult,ScaleXDec4);
5542    // Convert to int
5543    __m128i vResulti = _mm_cvttps_epi32(vResult);
5544    // Mask off any fraction
5545    vResulti = _mm_and_si128(vResulti,MaskXDec4);
5546    // Do a horizontal or of 4 entries
5547    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5548    // x = x|z, y = y|w
5549    vResulti = _mm_or_si128(vResulti,vResulti2);
5550    // Move Z to the x position
5551    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5552    // Perform a single bit left shift on y|w
5553    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5554    // i = x|y|z|w
5555    vResulti = _mm_or_si128(vResulti,vResulti2);
5556    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5557#else // _XM_VMX128_INTRINSICS_
5558#endif // _XM_VMX128_INTRINSICS_
5559}
5560
5561//------------------------------------------------------------------------------
5562
5563XMFINLINE VOID XMStoreUDecN4
5564(
5565    XMUDECN4* pDestination,
5566    FXMVECTOR  V
5567)
5568{
5569#if defined(_XM_NO_INTRINSICS_)
5570
5571    XMVECTOR               N;
5572    static CONST XMVECTORF32  Scale = {1023.0f, 1023.0f, 1023.0f, 3.0f};
5573
5574    XMASSERT(pDestination);
5575
5576    N = XMVectorSaturate(V);
5577    N = XMVectorMultiply(N, Scale.v);
5578
5579    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
5580                       (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
5581                       (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
5582                       (((UINT)N.vector4_f32[0] & 0x3FF));
5583
5584#elif defined(_XM_SSE_INTRINSICS_)
5585    XMASSERT(pDestination);
5586    static const XMVECTORF32 ScaleUDecN4 = {1023.0f,1023.0f*1024.0f*0.5f,1023.0f*1024.0f*1024.0f,3.0f*1024.0f*1024.0f*1024.0f*0.5f};
5587    static const XMVECTORI32 MaskUDecN4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
5588    // Clamp to bounds
5589    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5590    vResult = _mm_min_ps(vResult,g_XMOne);
5591    // Scale by multiplication
5592    vResult = _mm_mul_ps(vResult,ScaleUDecN4);
5593    // Convert to int
5594    __m128i vResulti = _mm_cvttps_epi32(vResult);
5595    // Mask off any fraction
5596    vResulti = _mm_and_si128(vResulti,MaskUDecN4);
5597    // Do a horizontal or of 4 entries
5598    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5599    // x = x|z, y = y|w
5600    vResulti = _mm_or_si128(vResulti,vResulti2);
5601    // Move Z to the x position
5602    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5603    // Perform a left shift by one bit on y|w
5604    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5605    // i = x|y|z|w
5606    vResulti = _mm_or_si128(vResulti,vResulti2);
5607    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5608#else // _XM_VMX128_INTRINSICS_
5609#endif // _XM_VMX128_INTRINSICS_
5610}
5611
5612//------------------------------------------------------------------------------
5613
5614XMFINLINE VOID XMStoreUDec4
5615(
5616    XMUDEC4* pDestination,
5617    FXMVECTOR  V
5618)
5619{
5620#if defined(_XM_NO_INTRINSICS_)
5621
5622    XMVECTOR               N;
5623    static CONST XMVECTOR  Max = {1023.0f, 1023.0f, 1023.0f, 3.0f};
5624
5625    XMASSERT(pDestination);
5626
5627    N = XMVectorClamp(V, XMVectorZero(), Max);
5628
5629    pDestination->v = ((UINT)N.vector4_f32[3] << 30) |
5630                       (((UINT)N.vector4_f32[2] & 0x3FF) << 20) |
5631                       (((UINT)N.vector4_f32[1] & 0x3FF) << 10) |
5632                       (((UINT)N.vector4_f32[0] & 0x3FF));
5633
5634#elif defined(_XM_SSE_INTRINSICS_)
5635    XMASSERT(pDestination);
5636    static const XMVECTORF32 MaxUDec4 = { 1023.0f, 1023.0f, 1023.0f, 3.0f};
5637    static const XMVECTORF32 ScaleUDec4 = {1.0f,1024.0f/2.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f/2.0f};
5638    static const XMVECTORI32 MaskUDec4= {0x3FF,0x3FF<<(10-1),0x3FF<<20,0x3<<(30-1)};
5639    // Clamp to bounds
5640    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5641    vResult = _mm_min_ps(vResult,MaxUDec4);
5642    // Scale by multiplication
5643    vResult = _mm_mul_ps(vResult,ScaleUDec4);
5644    // Convert to int
5645    __m128i vResulti = _mm_cvttps_epi32(vResult);
5646    // Mask off any fraction
5647    vResulti = _mm_and_si128(vResulti,MaskUDec4);
5648    // Do a horizontal or of 4 entries
5649    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5650    // x = x|z, y = y|w
5651    vResulti = _mm_or_si128(vResulti,vResulti2);
5652    // Move Z to the x position
5653    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5654    // Perform a left shift by one bit on y|w
5655    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5656    // i = x|y|z|w
5657    vResulti = _mm_or_si128(vResulti,vResulti2);
5658    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5659#else // _XM_VMX128_INTRINSICS_
5660#endif // _XM_VMX128_INTRINSICS_
5661}
5662
5663//------------------------------------------------------------------------------
5664
5665XMFINLINE VOID XMStoreDecN4
5666(
5667    XMDECN4* pDestination,
5668    FXMVECTOR V
5669)
5670{
5671#if defined(_XM_NO_INTRINSICS_)
5672
5673    XMVECTOR               N;
5674    static CONST XMVECTORF32  Scale = {511.0f, 511.0f, 511.0f, 1.0f};
5675
5676    XMASSERT(pDestination);
5677
5678    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
5679    N = XMVectorMultiply(N, Scale.v);
5680
5681    pDestination->v = ((INT)N.vector4_f32[3] << 30) |
5682                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
5683                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
5684                       (((INT)N.vector4_f32[0] & 0x3FF));
5685
5686#elif defined(_XM_SSE_INTRINSICS_)
5687    XMASSERT(pDestination);
5688    static const XMVECTORF32 ScaleDecN4 = {511.0f,511.0f*1024.0f,511.0f*1024.0f*1024.0f,1.0f*1024.0f*1024.0f*1024.0f};
5689    static const XMVECTORI32 MaskDecN4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
5690    // Clamp to bounds
5691    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
5692    vResult = _mm_min_ps(vResult,g_XMOne);
5693    // Scale by multiplication
5694    vResult = _mm_mul_ps(vResult,ScaleDecN4);
5695    // Convert to int
5696    __m128i vResulti = _mm_cvttps_epi32(vResult);
5697    // Mask off any fraction
5698    vResulti = _mm_and_si128(vResulti,MaskDecN4);
5699    // Do a horizontal or of 4 entries
5700    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5701    // x = x|z, y = y|w
5702    vResulti = _mm_or_si128(vResulti,vResulti2);
5703    // Move Z to the x position
5704    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5705    // i = x|y|z|w
5706    vResulti = _mm_or_si128(vResulti,vResulti2);
5707    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5708#else // _XM_VMX128_INTRINSICS_
5709#endif // _XM_VMX128_INTRINSICS_
5710}
5711
5712//------------------------------------------------------------------------------
5713
5714XMFINLINE VOID XMStoreDec4
5715(
5716    XMDEC4*  pDestination,
5717    FXMVECTOR V
5718)
5719{
5720#if defined(_XM_NO_INTRINSICS_)
5721
5722    XMVECTOR               N;
5723    static CONST XMVECTOR  Min = {-511.0f, -511.0f, -511.0f, -1.0f};
5724    static CONST XMVECTOR  Max = {511.0f, 511.0f, 511.0f, 1.0f};
5725
5726    XMASSERT(pDestination);
5727
5728    N = XMVectorClamp(V, Min, Max);
5729
5730    pDestination->v = ((INT)N.vector4_f32[3] << 30) |
5731                       (((INT)N.vector4_f32[2] & 0x3FF) << 20) |
5732                       (((INT)N.vector4_f32[1] & 0x3FF) << 10) |
5733                       (((INT)N.vector4_f32[0] & 0x3FF));
5734
5735#elif defined(_XM_SSE_INTRINSICS_)
5736    XMASSERT(pDestination);
5737    static const XMVECTORF32 MinDec4 = {-511.0f,-511.0f,-511.0f,-1.0f};
5738    static const XMVECTORF32 MaxDec4 = { 511.0f, 511.0f, 511.0f, 1.0f};
5739    static const XMVECTORF32 ScaleDec4 = {1.0f,1024.0f,1024.0f*1024.0f,1024.0f*1024.0f*1024.0f};
5740    static const XMVECTORI32 MaskDec4= {0x3FF,0x3FF<<10,0x3FF<<20,0x3<<30};
5741    // Clamp to bounds
5742    XMVECTOR vResult = _mm_max_ps(V,MinDec4);
5743    vResult = _mm_min_ps(vResult,MaxDec4);
5744    // Scale by multiplication
5745    vResult = _mm_mul_ps(vResult,ScaleDec4);
5746    // Convert to int
5747    __m128i vResulti = _mm_cvttps_epi32(vResult);
5748    // Mask off any fraction
5749    vResulti = _mm_and_si128(vResulti,MaskDec4);
5750    // Do a horizontal or of 4 entries
5751    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5752    // x = x|z, y = y|w
5753    vResulti = _mm_or_si128(vResulti,vResulti2);
5754    // Move Z to the x position
5755    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5756    // i = x|y|z|w
5757    vResulti = _mm_or_si128(vResulti,vResulti2);
5758    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5759#else // _XM_VMX128_INTRINSICS_
5760#endif // _XM_VMX128_INTRINSICS_
5761}
5762
5763//------------------------------------------------------------------------------
5764
5765XMFINLINE VOID XMStoreUByteN4
5766(
5767    XMUBYTEN4* pDestination,
5768    FXMVECTOR V
5769)
5770{
5771#if defined(_XM_NO_INTRINSICS_)
5772
5773    XMVECTOR               N;
5774    static CONST XMVECTORF32  Scale = {255.0f, 255.0f, 255.0f, 255.0f};
5775
5776    XMASSERT(pDestination);
5777
5778    N = XMVectorSaturate(V);
5779    N = XMVectorMultiply(N, Scale.v);
5780    N = XMVectorRound(N);
5781
5782    pDestination->x = (BYTE)N.vector4_f32[0];
5783    pDestination->y = (BYTE)N.vector4_f32[1];
5784    pDestination->z = (BYTE)N.vector4_f32[2];
5785    pDestination->w = (BYTE)N.vector4_f32[3];
5786
5787#elif defined(_XM_SSE_INTRINSICS_)
5788    XMASSERT(pDestination);
5789    static const XMVECTORF32 ScaleUByteN4 = {255.0f,255.0f*256.0f*0.5f,255.0f*256.0f*256.0f,255.0f*256.0f*256.0f*256.0f*0.5f};
5790    static const XMVECTORI32 MaskUByteN4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
5791    // Clamp to bounds
5792    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5793    vResult = _mm_min_ps(vResult,g_XMOne);
5794    // Scale by multiplication
5795    vResult = _mm_mul_ps(vResult,ScaleUByteN4);
5796    // Convert to int
5797    __m128i vResulti = _mm_cvttps_epi32(vResult);
5798    // Mask off any fraction
5799    vResulti = _mm_and_si128(vResulti,MaskUByteN4);
5800    // Do a horizontal or of 4 entries
5801    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5802    // x = x|z, y = y|w
5803    vResulti = _mm_or_si128(vResulti,vResulti2);
5804    // Move Z to the x position
5805    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5806    // Perform a single bit left shift to fix y|w
5807    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5808    // i = x|y|z|w
5809    vResulti = _mm_or_si128(vResulti,vResulti2);
5810    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5811#else // _XM_VMX128_INTRINSICS_
5812#endif // _XM_VMX128_INTRINSICS_
5813}
5814
5815//------------------------------------------------------------------------------
5816
5817XMFINLINE VOID XMStoreUByte4
5818(
5819    XMUBYTE4* pDestination,
5820    FXMVECTOR  V
5821)
5822{
5823#if defined(_XM_NO_INTRINSICS_)
5824
5825    XMVECTOR               N;
5826    static CONST XMVECTOR  Max = {255.0f, 255.0f, 255.0f, 255.0f};
5827
5828    XMASSERT(pDestination);
5829
5830    N = XMVectorClamp(V, XMVectorZero(), Max);
5831    N = XMVectorRound(N);
5832
5833    pDestination->x = (BYTE)N.vector4_f32[0];
5834    pDestination->y = (BYTE)N.vector4_f32[1];
5835    pDestination->z = (BYTE)N.vector4_f32[2];
5836    pDestination->w = (BYTE)N.vector4_f32[3];
5837
5838#elif defined(_XM_SSE_INTRINSICS_)
5839    XMASSERT(pDestination);
5840    static const XMVECTORF32 MaxUByte4 = { 255.0f, 255.0f, 255.0f, 255.0f};
5841    static const XMVECTORF32 ScaleUByte4 = {1.0f,256.0f*0.5f,256.0f*256.0f,256.0f*256.0f*256.0f*0.5f};
5842    static const XMVECTORI32 MaskUByte4 = {0xFF,0xFF<<(8-1),0xFF<<16,0xFF<<(24-1)};
5843    // Clamp to bounds
5844    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5845    vResult = _mm_min_ps(vResult,MaxUByte4);
5846    // Scale by multiplication
5847    vResult = _mm_mul_ps(vResult,ScaleUByte4);
5848    // Convert to int
5849    __m128i vResulti = _mm_cvttps_epi32(vResult);
5850    // Mask off any fraction
5851    vResulti = _mm_and_si128(vResulti,MaskUByte4);
5852    // Do a horizontal or of 4 entries
5853    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5854    // x = x|z, y = y|w
5855    vResulti = _mm_or_si128(vResulti,vResulti2);
5856    // Move Z to the x position
5857    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5858    // Perform a single bit left shift to fix y|w
5859    vResulti2 = _mm_add_epi32(vResulti2,vResulti2);
5860    // i = x|y|z|w
5861    vResulti = _mm_or_si128(vResulti,vResulti2);
5862    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5863#else // _XM_VMX128_INTRINSICS_
5864#endif // _XM_VMX128_INTRINSICS_
5865}
5866
5867//------------------------------------------------------------------------------
5868
5869XMFINLINE VOID XMStoreByteN4
5870(
5871    XMBYTEN4* pDestination,
5872    FXMVECTOR  V
5873)
5874{
5875#if defined(_XM_NO_INTRINSICS_)
5876
5877    XMVECTOR               N;
5878    static CONST XMVECTORF32  Scale = {127.0f, 127.0f, 127.0f, 127.0f};
5879
5880    XMASSERT(pDestination);
5881
5882    N = XMVectorClamp(V, g_XMNegativeOne.v, g_XMOne.v);
5883    N = XMVectorMultiply(N, Scale.v);
5884    N = XMVectorRound(N);
5885
5886    pDestination->x = (CHAR)N.vector4_f32[0];
5887    pDestination->y = (CHAR)N.vector4_f32[1];
5888    pDestination->z = (CHAR)N.vector4_f32[2];
5889    pDestination->w = (CHAR)N.vector4_f32[3];
5890
5891#elif defined(_XM_SSE_INTRINSICS_)
5892    XMASSERT(pDestination);
5893    static const XMVECTORF32 ScaleByteN4 = {127.0f,127.0f*256.0f,127.0f*256.0f*256.0f,127.0f*256.0f*256.0f*256.0f};
5894    static const XMVECTORI32 MaskByteN4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
5895    // Clamp to bounds
5896    XMVECTOR vResult = _mm_max_ps(V,g_XMNegativeOne);
5897    vResult = _mm_min_ps(vResult,g_XMOne);
5898    // Scale by multiplication
5899    vResult = _mm_mul_ps(vResult,ScaleByteN4);
5900    // Convert to int
5901    __m128i vResulti = _mm_cvttps_epi32(vResult);
5902    // Mask off any fraction
5903    vResulti = _mm_and_si128(vResulti,MaskByteN4);
5904    // Do a horizontal or of 4 entries
5905    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5906    // x = x|z, y = y|w
5907    vResulti = _mm_or_si128(vResulti,vResulti2);
5908    // Move Z to the x position
5909    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5910    // i = x|y|z|w
5911    vResulti = _mm_or_si128(vResulti,vResulti2);
5912    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5913#else // _XM_VMX128_INTRINSICS_
5914#endif // _XM_VMX128_INTRINSICS_
5915}
5916
5917//------------------------------------------------------------------------------
5918
5919XMFINLINE VOID XMStoreByte4
5920(
5921    XMBYTE4*  pDestination,
5922    FXMVECTOR  V
5923)
5924{
5925#if defined(_XM_NO_INTRINSICS_)
5926
5927    XMVECTOR               N;
5928    static CONST XMVECTOR  Min = {-127.0f, -127.0f, -127.0f, -127.0f};
5929    static CONST XMVECTOR  Max = {127.0f, 127.0f, 127.0f, 127.0f};
5930
5931    XMASSERT(pDestination);
5932
5933    N = XMVectorClamp(V, Min, Max);
5934    N = XMVectorRound(N);
5935
5936    pDestination->x = (CHAR)N.vector4_f32[0];
5937    pDestination->y = (CHAR)N.vector4_f32[1];
5938    pDestination->z = (CHAR)N.vector4_f32[2];
5939    pDestination->w = (CHAR)N.vector4_f32[3];
5940
5941#elif defined(_XM_SSE_INTRINSICS_)
5942    XMASSERT(pDestination);
5943    static const XMVECTORF32 MinByte4 = {-127.0f,-127.0f,-127.0f,-127.0f};
5944    static const XMVECTORF32 MaxByte4 = { 127.0f, 127.0f, 127.0f, 127.0f};
5945    static const XMVECTORF32 ScaleByte4 = {1.0f,256.0f,256.0f*256.0f,256.0f*256.0f*256.0f};
5946    static const XMVECTORI32 MaskByte4 = {0xFF,0xFF<<8,0xFF<<16,0xFF<<24};
5947    // Clamp to bounds
5948    XMVECTOR vResult = _mm_max_ps(V,MinByte4);
5949    vResult = _mm_min_ps(vResult,MaxByte4);
5950    // Scale by multiplication
5951    vResult = _mm_mul_ps(vResult,ScaleByte4);
5952    // Convert to int
5953    __m128i vResulti = _mm_cvttps_epi32(vResult);
5954    // Mask off any fraction
5955    vResulti = _mm_and_si128(vResulti,MaskByte4);
5956    // Do a horizontal or of 4 entries
5957    __m128i vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(3,2,3,2));
5958    // x = x|z, y = y|w
5959    vResulti = _mm_or_si128(vResulti,vResulti2);
5960    // Move Z to the x position
5961    vResulti2 = _mm_shuffle_epi32(vResulti,_MM_SHUFFLE(1,1,1,1));
5962    // i = x|y|z|w
5963    vResulti = _mm_or_si128(vResulti,vResulti2);
5964    _mm_store_ss(reinterpret_cast<float *>(&pDestination->v),reinterpret_cast<const __m128 *>(&vResulti)[0]);
5965#else // _XM_VMX128_INTRINSICS_
5966#endif // _XM_VMX128_INTRINSICS_
5967}
5968
5969//------------------------------------------------------------------------------
5970
5971XMFINLINE VOID XMStoreUNibble4
5972(
5973     XMUNIBBLE4* pDestination,
5974     FXMVECTOR V
5975)
5976{
5977#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
5978    XMASSERT(pDestination);
5979    static CONST XMVECTORF32  Max = {15.0f,15.0f,15.0f,15.0f};
5980    // Bounds check
5981    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
5982    vResult = _mm_min_ps(vResult,Max);
5983    // Convert to int with rounding
5984    __m128i vInt = _mm_cvtps_epi32(vResult);
5985    // No SSE operations will write to 16-bit values, so we have to extract them manually
5986    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
5987    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
5988    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
5989    USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
5990    pDestination->v = ((w & 0xF) << 12) |
5991                      ((z & 0xF) << 8) |
5992                      ((y & 0xF) << 4) |
5993                      ((x & 0xF));
5994#else
5995    XMVECTOR               N;
5996    static CONST XMVECTORF32  Max = {15.0f,15.0f,15.0f,15.0f};
5997
5998    XMASSERT(pDestination);
5999
6000    N = XMVectorClamp(V, XMVectorZero(), Max.v);
6001    N = XMVectorRound(N);
6002
6003    pDestination->v = (((USHORT)N.vector4_f32[3] & 0xF) << 12) |
6004                      (((USHORT)N.vector4_f32[2] & 0xF) << 8) |
6005                      (((USHORT)N.vector4_f32[1] & 0xF) << 4) |
6006                      (((USHORT)N.vector4_f32[0] & 0xF));
6007#endif // !_XM_SSE_INTRINSICS_
6008}
6009
6010//------------------------------------------------------------------------------
6011
6012XMFINLINE VOID XMStoreU555(
6013     XMU555* pDestination,
6014     FXMVECTOR V
6015)
6016{
6017#if defined(_XM_SSE_INTRINSICS_) && !defined(_XM_NO_INTRINSICS_)
6018    XMASSERT(pDestination);
6019    static CONST XMVECTORF32  Max = {31.0f, 31.0f, 31.0f, 1.0f};
6020    // Bounds check
6021    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
6022    vResult = _mm_min_ps(vResult,Max);
6023    // Convert to int with rounding
6024    __m128i vInt = _mm_cvtps_epi32(vResult);
6025    // No SSE operations will write to 16-bit values, so we have to extract them manually
6026    USHORT x = static_cast<USHORT>(_mm_extract_epi16(vInt,0));
6027    USHORT y = static_cast<USHORT>(_mm_extract_epi16(vInt,2));
6028    USHORT z = static_cast<USHORT>(_mm_extract_epi16(vInt,4));
6029    USHORT w = static_cast<USHORT>(_mm_extract_epi16(vInt,6));
6030    pDestination->v = ((w) ? 0x8000 : 0) |
6031                      ((z & 0x1F) << 10) |
6032                      ((y & 0x1F) << 5) |
6033                      ((x & 0x1F));
6034#else
6035    XMVECTOR               N;
6036    static CONST XMVECTORF32  Max = {31.0f, 31.0f, 31.0f, 1.0f};
6037
6038    XMASSERT(pDestination);
6039
6040    N = XMVectorClamp(V, XMVectorZero(), Max.v);
6041    N = XMVectorRound(N);
6042
6043    pDestination->v = ((N.vector4_f32[3] > 0.f) ? 0x8000 : 0) |
6044                      (((USHORT)N.vector4_f32[2] & 0x1F) << 10) |
6045                      (((USHORT)N.vector4_f32[1] & 0x1F) << 5) |
6046                      (((USHORT)N.vector4_f32[0] & 0x1F));
6047#endif // !_XM_SSE_INTRINSICS_
6048}
6049
6050//------------------------------------------------------------------------------
6051
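// Implementation note: XMCOLOR stores the color as 0xAARRGGBB. The SSE path below
// saturates to [0,1], scales by 255, shuffles the lanes so the packed bytes land in
// B,G,R,A memory order (0xAARRGGBB when read as a UINT), converts with rounding,
// and uses the two saturating pack instructions to squeeze the four ints down to
// bytes before a single 32-bit store.
//
// Illustrative sketch (not part of the library):
//     XMCOLOR c;
//     XMStoreColor(&c, XMVectorSet(1.0f, 0.6f, 0.2f, 1.0f));
//     // expected: c.c == 0xFFFF9933 (A=255, R=255, G=153, B=51)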
6052XMFINLINE VOID XMStoreColor
6053(
6054    XMCOLOR* pDestination,
6055    FXMVECTOR V
6056)
6057{
6058#if defined(_XM_NO_INTRINSICS_)
6059
6060    XMVECTOR               N;
6061    static CONST XMVECTORF32  Scale = {255.0f, 255.0f, 255.0f, 255.0f};
6062
6063    XMASSERT(pDestination);
6064
6065    N = XMVectorSaturate(V);
6066    N = XMVectorMultiply(N, Scale.v);
6067    N = XMVectorRound(N);
6068
6069    pDestination->c = ((UINT)N.vector4_f32[3] << 24) |
6070                      ((UINT)N.vector4_f32[0] << 16) |
6071                      ((UINT)N.vector4_f32[1] <<  8) |
6072                      ((UINT)N.vector4_f32[2]);
6073
6074#elif defined(_XM_SSE_INTRINSICS_)
6075    XMASSERT(pDestination);
6076    static CONST XMVECTORF32  Scale = {255.0f,255.0f,255.0f,255.0f};
6077    // Set <0 to 0
6078    XMVECTOR vResult = _mm_max_ps(V,g_XMZero);
6079    // Set >1 to 1
6080    vResult = _mm_min_ps(vResult,g_XMOne);
6081    // Convert to 0-255
6082    vResult = _mm_mul_ps(vResult,Scale);
6083    // Shuffle RGBA to ARGB
6084    vResult = _mm_shuffle_ps(vResult,vResult,_MM_SHUFFLE(3,0,1,2));
6085    // Convert to int
6086    __m128i vInt = _mm_cvtps_epi32(vResult);
6087    // Mash to shorts
6088    vInt = _mm_packs_epi32(vInt,vInt);
6089    // Mash to bytes
6090    vInt = _mm_packus_epi16(vInt,vInt);
6091    // Store the color
6092    _mm_store_ss(reinterpret_cast<float *>(&pDestination->c),reinterpret_cast<__m128 *>(&vInt)[0]);
6093#else // _XM_VMX128_INTRINSICS_
6094#endif // _XM_VMX128_INTRINSICS_
6095}
6096
6097//------------------------------------------------------------------------------
6098
6099XMFINLINE VOID XMStoreFloat3x3
6100(
6101    XMFLOAT3X3*	pDestination,
6102    CXMMATRIX	M
6103)
6104{
6105#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
6106
6107    XMStoreFloat3x3NC(pDestination, M);
6108
6109#else // _XM_VMX128_INTRINSICS_
6110#endif // _XM_VMX128_INTRINSICS_
6111}
6112
6113//------------------------------------------------------------------------------
6114
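// Implementation note: the SSE path below writes the nine floats of the 3x3 matrix
// with three stores instead of nine scalar writes: an unaligned store of
// {m00, m01, m02, m10}, an unaligned store of {m11, m12, m20, m21}, and a scalar
// store of m22, assembled from the matrix rows with shuffles.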
6115XMFINLINE VOID XMStoreFloat3x3NC
6116(
6117    XMFLOAT3X3* pDestination,
6118    CXMMATRIX M
6119)
6120{
6121#if defined(_XM_NO_INTRINSICS_)
6122
6123    XMASSERT(pDestination);
6124
6125    pDestination->m[0][0] = M.r[0].vector4_f32[0];
6126    pDestination->m[0][1] = M.r[0].vector4_f32[1];
6127    pDestination->m[0][2] = M.r[0].vector4_f32[2];
6128
6129    pDestination->m[1][0] = M.r[1].vector4_f32[0];
6130    pDestination->m[1][1] = M.r[1].vector4_f32[1];
6131    pDestination->m[1][2] = M.r[1].vector4_f32[2];
6132
6133    pDestination->m[2][0] = M.r[2].vector4_f32[0];
6134    pDestination->m[2][1] = M.r[2].vector4_f32[1];
6135    pDestination->m[2][2] = M.r[2].vector4_f32[2];
6136
6137#elif defined(_XM_SSE_INTRINSICS_)
6138    XMASSERT(pDestination);
6139    XMVECTOR vTemp1 = M.r[0];
6140    XMVECTOR vTemp2 = M.r[1];
6141    XMVECTOR vTemp3 = M.r[2];
6142    XMVECTOR vWork = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,0,2,2));
6143    vTemp1 = _mm_shuffle_ps(vTemp1,vWork,_MM_SHUFFLE(2,0,1,0));
6144    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
6145    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
6146    _mm_storeu_ps(&pDestination->m[1][1],vTemp2);
6147    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp3,_MM_SHUFFLE(2,2,2,2));
6148    _mm_store_ss(&pDestination->m[2][2],vTemp3);
6149#else // _XM_VMX128_INTRINSICS_
6150#endif // _XM_VMX128_INTRINSICS_
6151}
6152
6153//------------------------------------------------------------------------------
6154
6155XMFINLINE VOID XMStoreFloat4x3
6156(
6157    XMFLOAT4X3* pDestination,
6158    CXMMATRIX M
6159)
6160{
6161#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS) || defined(_XM_SSE_INTRINSICS_)
6162
6163    XMStoreFloat4x3NC(pDestination, M);
6164
6165#else // _XM_VMX128_INTRINSICS_
6166#endif // _XM_VMX128_INTRINSICS_
6167}
6168
6169//------------------------------------------------------------------------------
6170
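// Implementation note: XMFLOAT4X3A is 16-byte aligned and the three 4-float groups
// {m00..m10}, {m11..m21} and {m22..m32} start at offsets 0, 16 and 32, so the SSE
// path below can cover all twelve floats with three aligned _mm_store_ps calls.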
6171XMFINLINE VOID XMStoreFloat4x3A
6172(
6173    XMFLOAT4X3A*	pDestination,
6174    CXMMATRIX		M
6175)
6176{
6177#if defined(_XM_NO_INTRINSICS_)
6178
6179    XMASSERT(pDestination);
6180    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
6181
6182    pDestination->m[0][0] = M.r[0].vector4_f32[0];
6183    pDestination->m[0][1] = M.r[0].vector4_f32[1];
6184    pDestination->m[0][2] = M.r[0].vector4_f32[2];
6185
6186    pDestination->m[1][0] = M.r[1].vector4_f32[0];
6187    pDestination->m[1][1] = M.r[1].vector4_f32[1];
6188    pDestination->m[1][2] = M.r[1].vector4_f32[2];
6189
6190    pDestination->m[2][0] = M.r[2].vector4_f32[0];
6191    pDestination->m[2][1] = M.r[2].vector4_f32[1];
6192    pDestination->m[2][2] = M.r[2].vector4_f32[2];
6193
6194    pDestination->m[3][0] = M.r[3].vector4_f32[0];
6195    pDestination->m[3][1] = M.r[3].vector4_f32[1];
6196    pDestination->m[3][2] = M.r[3].vector4_f32[2];
6197
6198#elif defined(_XM_SSE_INTRINSICS_)
6199    XMASSERT(pDestination);
6200    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);
6201    // x1,y1,z1,w1
6202    XMVECTOR vTemp1 = M.r[0];
6203    // x2,y2,z2,w2
6204    XMVECTOR vTemp2 = M.r[1];
6205    // x3,y3,z3,w3
6206    XMVECTOR vTemp3 = M.r[2];
6207    // x4,y4,z4,w4
6208    XMVECTOR vTemp4 = M.r[3];
6209    // z1,z1,x2,y2
6210    XMVECTOR vTemp = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(1,0,2,2));
6211    // y2,z2,x3,y3 (Final)
6212    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
6213    // x1,y1,z1,x2 (Final)
6214    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp,_MM_SHUFFLE(2,0,1,0));
6215    // z3,z3,x4,x4
6216    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
6217    // z3,x4,y4,z4 (Final)
6218    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
6219    // Store in 3 operations
6220    _mm_store_ps(&pDestination->m[0][0],vTemp1);
6221    _mm_store_ps(&pDestination->m[1][1],vTemp2);
6222    _mm_store_ps(&pDestination->m[2][2],vTemp3);
6223#else // _XM_VMX128_INTRINSICS_
6224#endif // _XM_VMX128_INTRINSICS_
6225}
6226
6227//------------------------------------------------------------------------------
6228
6229XMFINLINE VOID XMStoreFloat4x3NC
6230(
6231    XMFLOAT4X3* pDestination,
6232    CXMMATRIX M
6233)
6234{
6235#if defined(_XM_NO_INTRINSICS_)
6236
6237    XMASSERT(pDestination);
6238
6239    pDestination->m[0][0] = M.r[0].vector4_f32[0];
6240    pDestination->m[0][1] = M.r[0].vector4_f32[1];
6241    pDestination->m[0][2] = M.r[0].vector4_f32[2];
6242
6243    pDestination->m[1][0] = M.r[1].vector4_f32[0];
6244    pDestination->m[1][1] = M.r[1].vector4_f32[1];
6245    pDestination->m[1][2] = M.r[1].vector4_f32[2];
6246
6247    pDestination->m[2][0] = M.r[2].vector4_f32[0];
6248    pDestination->m[2][1] = M.r[2].vector4_f32[1];
6249    pDestination->m[2][2] = M.r[2].vector4_f32[2];
6250
6251    pDestination->m[3][0] = M.r[3].vector4_f32[0];
6252    pDestination->m[3][1] = M.r[3].vector4_f32[1];
6253    pDestination->m[3][2] = M.r[3].vector4_f32[2];
6254
#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    // x1,y1,z1,w1
    XMVECTOR vTemp1 = M.r[0];
    // x2,y2,z2,w2
    XMVECTOR vTemp2 = M.r[1];
    // x3,y3,z3,w3
    XMVECTOR vTemp3 = M.r[2];
    // x4,y4,z4,w4
    XMVECTOR vTemp4 = M.r[3];
    // y2,z2,x3,y3 (Final)
    XMVECTOR vTemp2x = _mm_shuffle_ps(vTemp2,vTemp3,_MM_SHUFFLE(1,0,2,1));
    // x2,x2,z1,z1
    vTemp2 = _mm_shuffle_ps(vTemp2,vTemp1,_MM_SHUFFLE(2,2,0,0));
    // x1,y1,z1,x2 (Final)
    vTemp1 = _mm_shuffle_ps(vTemp1,vTemp2,_MM_SHUFFLE(0,2,1,0));
    // z3,z3,x4,x4
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(0,0,2,2));
    // z3,x4,y4,z4 (Final)
    vTemp3 = _mm_shuffle_ps(vTemp3,vTemp4,_MM_SHUFFLE(2,1,2,0));
    // Store in 3 unaligned operations
    _mm_storeu_ps(&pDestination->m[0][0],vTemp1);
    _mm_storeu_ps(&pDestination->m[1][1],vTemp2x);
    _mm_storeu_ps(&pDestination->m[2][2],vTemp3);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4x4
(
    XMFLOAT4X4* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_) || defined(XM_NO_MISALIGNED_VECTOR_ACCESS)

    XMStoreFloat4x4NC(pDestination, M);

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);

    _mm_storeu_ps( &pDestination->_11, M.r[0] );
    _mm_storeu_ps( &pDestination->_21, M.r[1] );
    _mm_storeu_ps( &pDestination->_31, M.r[2] );
    _mm_storeu_ps( &pDestination->_41, M.r[3] );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4x4A
(
    XMFLOAT4X4A*	pDestination,
    CXMMATRIX		M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);
    XMASSERT(((UINT_PTR)pDestination & 0xF) == 0);

    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[0][3] = M.r[0].vector4_f32[3];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[1][3] = M.r[1].vector4_f32[3];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[2][3] = M.r[2].vector4_f32[3];

    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
    pDestination->m[3][3] = M.r[3].vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);

    _mm_store_ps( &pDestination->_11, M.r[0] );
    _mm_store_ps( &pDestination->_21, M.r[1] );
    _mm_store_ps( &pDestination->_31, M.r[2] );
    _mm_store_ps( &pDestination->_41, M.r[3] );
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

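// Usage sketch (illustrative only; the variable names below are hypothetical):
// XMStoreFloat4x4A requires a 16-byte aligned destination and uses aligned
// stores, while XMStoreFloat4x4 accepts any XMFLOAT4X4 address.
//
//     XMFLOAT4X4  anyMatrix;      // arbitrary alignment is acceptable
//     XMFLOAT4X4A fastMatrix;     // 16-byte aligned by definition
//     XMMATRIX view = XMMatrixIdentity();
//     XMStoreFloat4x4(&anyMatrix, view);     // unaligned-safe stores
//     XMStoreFloat4x4A(&fastMatrix, view);   // aligned stores; checked by XMASSERT
//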
//------------------------------------------------------------------------------

XMFINLINE VOID XMStoreFloat4x4NC
(
    XMFLOAT4X4* pDestination,
    CXMMATRIX M
)
{
#if defined(_XM_NO_INTRINSICS_)

    XMASSERT(pDestination);

    pDestination->m[0][0] = M.r[0].vector4_f32[0];
    pDestination->m[0][1] = M.r[0].vector4_f32[1];
    pDestination->m[0][2] = M.r[0].vector4_f32[2];
    pDestination->m[0][3] = M.r[0].vector4_f32[3];

    pDestination->m[1][0] = M.r[1].vector4_f32[0];
    pDestination->m[1][1] = M.r[1].vector4_f32[1];
    pDestination->m[1][2] = M.r[1].vector4_f32[2];
    pDestination->m[1][3] = M.r[1].vector4_f32[3];

    pDestination->m[2][0] = M.r[2].vector4_f32[0];
    pDestination->m[2][1] = M.r[2].vector4_f32[1];
    pDestination->m[2][2] = M.r[2].vector4_f32[2];
    pDestination->m[2][3] = M.r[2].vector4_f32[3];

    pDestination->m[3][0] = M.r[3].vector4_f32[0];
    pDestination->m[3][1] = M.r[3].vector4_f32[1];
    pDestination->m[3][2] = M.r[3].vector4_f32[2];
    pDestination->m[3][3] = M.r[3].vector4_f32[3];

#elif defined(_XM_SSE_INTRINSICS_)
    XMASSERT(pDestination);
    _mm_storeu_ps(&pDestination->m[0][0],M.r[0]);
    _mm_storeu_ps(&pDestination->m[1][0],M.r[1]);
    _mm_storeu_ps(&pDestination->m[2][0],M.r[2]);
    _mm_storeu_ps(&pDestination->m[3][0],M.r[3]);
#else // _XM_VMX128_INTRINSICS_
#endif // _XM_VMX128_INTRINSICS_
}

#endif // __XNAMATHCONVERT_INL__