/*
-----------------------------------------------------------------------------
This source file is part of OGRE
(Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2013 Torus Knot Software Ltd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-----------------------------------------------------------------------------
*/

#include "OgreStableHeaders.h"

#include "OgreOptimisedUtil.h"
#include "OgrePlatformInformation.h"

#if __OGRE_HAVE_DIRECTXMATH

#include "OgreVector3.h"
#include "OgreMatrix4.h"

#include <directxmath.h>
using namespace DirectX;

// Use the unrolled version when the vertex count exceeds this limit
#define OGRE_DIRECTXMATH_SKINNING_UNROLL_VERTICES 16
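// (Sketch of the rationale, as an assumption rather than a measured figure: the
// packed routines below process four vertices per iteration and need extra
// alignment/dispatch set-up, which presumably only pays off above this size.)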

namespace Ogre {

//-------------------------------------------------------------------------
// Local classes
//-------------------------------------------------------------------------

    /** DirectXMath implementation of OptimisedUtil.
    @note
        Don't use this class directly; use OptimisedUtil instead.
    */
    class _OgrePrivate OptimisedUtilDirectXMath : public OptimisedUtil
    {
    public:
        /// @copydoc OptimisedUtil::softwareVertexSkinning
        virtual void softwareVertexSkinning(
            const float *srcPosPtr, float *destPosPtr,
            const float *srcNormPtr, float *destNormPtr,
            const float *blendWeightPtr, const unsigned char* blendIndexPtr,
            const Matrix4* const* blendMatrices,
            size_t srcPosStride, size_t destPosStride,
            size_t srcNormStride, size_t destNormStride,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numVertices);

        /// @copydoc OptimisedUtil::softwareVertexMorph
        virtual void softwareVertexMorph(
            Real t,
            const float *srcPos1, const float *srcPos2,
            float *dstPos,
            size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
            size_t numVertices,
            bool morphNormals);

        /// @copydoc OptimisedUtil::concatenateAffineMatrices
        virtual void concatenateAffineMatrices(
            const Matrix4& baseMatrix,
            const Matrix4* srcMatrices,
            Matrix4* dstMatrices,
            size_t numMatrices);

        /// @copydoc OptimisedUtil::calculateFaceNormals
        virtual void calculateFaceNormals(
            const float *positions,
            const EdgeData::Triangle *triangles,
            Vector4 *faceNormals,
            size_t numTriangles);

        /// @copydoc OptimisedUtil::calculateLightFacing
        virtual void calculateLightFacing(
            const Vector4& lightPos,
            const Vector4* faceNormals,
            char* lightFacings,
            size_t numFaces);

        /// @copydoc OptimisedUtil::extrudeVertices
        virtual void extrudeVertices(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* srcPositions,
            float* destPositions,
            size_t numVertices);
    };

//---------------------------------------------------------------------
// DirectXMath helpers.
//---------------------------------------------------------------------

/** Check whether the given pointer is perfectly aligned for DirectXMath.
*/
static FORCEINLINE bool _isAlignedForDirectXMath(const void *p)
{
    return (((size_t)p) & 15) == 0;
}
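
// Illustrative sketch of the check above (addresses are hypothetical): a pointer
// qualifies for the aligned XMVECTOR load/store path only when its low four bits
// are clear, i.e. the address is a multiple of 16:
//
//     _isAlignedForDirectXMath((void*)0x20A0);    // true:  0x20A0 % 16 == 0
//     _isAlignedForDirectXMath((void*)0x20A8);    // false: 8-byte offset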

/// Linear interpolation
#define __DX_LERP_PS(t, a, b)                                                       \
    XMVectorLerpV(a, b, t)

/// Linear interpolation. A single-value lerp is not supported in DirectXMath; fall back to __DX_LERP_PS.
#define __DX_LERP_SS(t, a, b)                                                       \
    __DX_LERP_PS(t, a, b)

#define __DX_LOAD_PS(p)                                                             \
    (*(XMVECTOR*)(p))

#define __DX_STORE_PS(p, v)                                                         \
    (*(XMVECTOR*)(p) = (v))

/// Accumulate three vectors of single-precision floating-point values.
#define __DX_ACCUM3_PS(a, b, c)                                                     \
    XMVectorAdd(XMVectorAdd(a, b), c)

/// Accumulate four vectors of single-precision floating-point values.
#define __DX_ACCUM4_PS(a, b, c, d)                                                  \
    XMVectorAdd(XMVectorAdd(a, b), XMVectorAdd(c, d))

/** Perform a dot product between two sets of three vectors of single-precision
    floating-point values.
*/
#define __DX_DOT3x3_PS(r0, r1, r2, v0, v1, v2)                                      \
    __DX_ACCUM3_PS(XMVectorMultiply(r0, v0), XMVectorMultiply(r1, v1), XMVectorMultiply(r2, v2))

/** Perform a dot product between four vectors and three vectors of single-precision
    floating-point values (the fourth vector, r3, is accumulated unmultiplied).
*/
#define __DX_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2)                                  \
    __DX_ACCUM4_PS(XMVectorMultiply(r0, v0), XMVectorMultiply(r1, v1), XMVectorMultiply(r2, v2), r3)

/** Perform the transpose of a 4x4 matrix of single-precision floating-point
    values.
    Arguments r0, r1, r2, and r3 are XMVECTOR values whose elements
    form the corresponding rows of a 4x4 matrix.
    The matrix transpose is returned in arguments r0, r1, r2, and
    r3, where r0 now holds column 0 of the original matrix, r1 now
    holds column 1 of the original matrix, etc.
*/
#define __DX_TRANSPOSE4x4_PS(r0, r1, r2, r3)                                            \
    {                                                                                   \
        XMVECTOR tmp3, tmp2, tmp1, tmp0;                                                \
                                                                                        \
                                                            /* r00 r01 r02 r03 */       \
                                                            /* r10 r11 r12 r13 */       \
                                                            /* r20 r21 r22 r23 */       \
                                                            /* r30 r31 r32 r33 */       \
                                                                                        \
        tmp0 = XMVectorMergeXY(r0, r1);                       /* r00 r10 r01 r11 */     \
        tmp2 = XMVectorMergeZW(r0, r1);                       /* r02 r12 r03 r13 */     \
        tmp1 = XMVectorMergeXY(r2, r3);                       /* r20 r30 r21 r31 */     \
        tmp3 = XMVectorMergeZW(r2, r3);                       /* r22 r32 r23 r33 */     \
                                                                                        \
        r0 = XMVectorPermute<0, 1, 4, 5>(tmp0, tmp1);           /* r00 r10 r20 r30 */   \
        r1 = XMVectorPermute<6, 7, 2, 3>(tmp1, tmp0);           /* r01 r11 r21 r31 */   \
        r2 = XMVectorPermute<0, 1, 4, 5>(tmp2, tmp3);           /* r02 r12 r22 r32 */   \
        r3 = XMVectorPermute<6, 7, 2, 3>(tmp3, tmp2);           /* r03 r13 r23 r33 */   \
    }

/** Perform the transpose of contiguously stored rows of a 4x3 matrix into a
    3x4 matrix of single-precision floating-point values.
    Arguments v0, v1, and v2 are XMVECTOR values whose elements form the
    corresponding contiguously stored rows of a 4x3 matrix.
    The matrix transpose is returned in arguments v0, v1, and v2, where
    v0 now holds column 0 of the original matrix, v1 now holds column 1
    of the original matrix, etc.
*/
#define __DX_TRANSPOSE4x3_PS(v0, v1, v2)                                                \
    {                                                                                   \
        XMVECTOR tmp0, tmp1, tmp2;                                                      \
                                                                                        \
                                                            /* r00 r01 r02 r10 */       \
                                                            /* r11 r12 r20 r21 */       \
                                                            /* r22 r30 r31 r32 */       \
                                                                                        \
        tmp0 = XMVectorPermute<0, 3, 4, 7>(v0, v2);         /* r00 r10 r22 r32 */       \
        tmp1 = XMVectorPermute<1, 2, 4, 5>(v0, v1);         /* r01 r02 r11 r12 */       \
        tmp2 = XMVectorPermute<2, 3, 5, 6>(v1, v2);         /* r20 r21 r30 r31 */       \
                                                                                        \
        v0 = XMVectorPermute<0, 1, 4, 6>(tmp0, tmp2);       /* r00 r10 r20 r30 */       \
        v1 = XMVectorPermute<0, 2, 5, 7>(tmp1, tmp2);       /* r01 r11 r21 r31 */       \
        v2 = XMVectorPermute<1, 3, 6, 7>(tmp1, tmp0);       /* r02 r12 r22 r32 */       \
    }

/** Perform the transpose of a 3x4 matrix into contiguously stored rows of a
    4x3 matrix of single-precision floating-point values.
    Arguments v0, v1, and v2 are XMVECTOR values whose elements form the
    corresponding columns of a 3x4 matrix.
    The matrix transpose is returned in arguments v0, v1, and v2 as
    contiguously stored rows of a 4x3 matrix.
*/
#define __DX_TRANSPOSE3x4_PS(v0, v1, v2)                                            \
    {                                                                               \
        XMVECTOR tmp0, tmp1, tmp2;                                                  \
                                                                                    \
                                                            /* r00 r10 r20 r30 */   \
                                                            /* r01 r11 r21 r31 */   \
                                                            /* r02 r12 r22 r32 */   \
                                                                                    \
        tmp0 = XMVectorPermute<1, 3, 4, 6>(v0, v2);         /* r10 r30 r02 r22 */   \
        tmp1 = XMVectorPermute<1, 3, 5, 7>(v1, v2);         /* r11 r31 r12 r32 */   \
        tmp2 = XMVectorPermute<0, 2, 4, 6>(v0, v1);         /* r00 r20 r01 r21 */   \
                                                                                    \
        v0 = XMVectorPermute<0, 2, 6, 4>(tmp2, tmp0);       /* r00 r01 r02 r10 */   \
        v1 = XMVectorPermute<0, 2, 5, 7>(tmp1, tmp2);       /* r11 r12 r20 r21 */   \
        v2 = XMVectorPermute<3, 1, 5, 7>(tmp0, tmp1);       /* r22 r30 r31 r32 */   \
    }
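
/* A note on the shuffles used above. XMVectorPermute<i0, i1, i2, i3>(a, b)
   selects each result element by index, where indices 0-3 address the elements
   of a and 4-7 address the elements of b; XMVectorMergeXY/XMVectorMergeZW
   interleave the low/high halves of their two operands. A minimal sketch
   (the values are illustrative only):

       XMVECTOR a = XMVectorSet(0.0f, 1.0f, 2.0f, 3.0f);
       XMVECTOR b = XMVectorSet(4.0f, 5.0f, 6.0f, 7.0f);
       XMVECTOR p = XMVectorPermute<0, 1, 4, 5>(a, b);  // (0, 1, 4, 5)
       XMVECTOR m = XMVectorMergeXY(a, b);              // (0, 4, 1, 5)
*/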

    /** Helper to load/store DirectXMath data based on whether or not the
        pointer is aligned.
    */
    template <bool aligned = false>
    struct DirectXMathMemoryAccessor
    {
        static FORCEINLINE XMVECTOR load(const float *p)
        {
            return XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(p));
        }
        static FORCEINLINE void store(float *p, const XMVECTOR& v)
        {
            XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(p), v);
        }
    };
    // Special aligned accessor
    template <>
    struct DirectXMathMemoryAccessor<true>
    {
        static FORCEINLINE XMVECTOR load(const float *p)
        {
            return __DX_LOAD_PS(p);
        }
        static FORCEINLINE void store(float *p, const XMVECTOR& v)
        {
            __DX_STORE_PS(p, v);
        }
    };
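
    // Usage sketch (illustrative): the aligned/unaligned decision is made once,
    // via a template argument, so no branch is needed inside the inner loops:
    //
    //     typedef DirectXMathMemoryAccessor<true> AlignedAccessor;
    //     XMVECTOR v = AlignedAccessor::load(pSrc);   // pSrc must be 16-byte aligned
    //     AlignedAccessor::store(pDest, v);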

//---------------------------------------------------------------------
// Some useful macros for collapsing matrices.
//---------------------------------------------------------------------

#define __LOAD_MATRIX(row0, row1, row2, pMatrix)                        \
    {                                                                   \
        row0 = __DX_LOAD_PS((*pMatrix)[0]);                             \
        row1 = __DX_LOAD_PS((*pMatrix)[1]);                             \
        row2 = __DX_LOAD_PS((*pMatrix)[2]);                             \
    }

#define __LERP_MATRIX(row0, row1, row2, weight, pMatrix)                \
    {                                                                   \
        row0 = XMVectorLerpV(row0, __DX_LOAD_PS((*pMatrix)[0]), weight);\
        row1 = XMVectorLerpV(row1, __DX_LOAD_PS((*pMatrix)[1]), weight);\
        row2 = XMVectorLerpV(row2, __DX_LOAD_PS((*pMatrix)[2]), weight);\
    }

#define __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix)       \
    {                                                                   \
        row0 = XMVectorMultiply(__DX_LOAD_PS((*pMatrix)[0]), weight);   \
        row1 = XMVectorMultiply(__DX_LOAD_PS((*pMatrix)[1]), weight);   \
        row2 = XMVectorMultiply(__DX_LOAD_PS((*pMatrix)[2]), weight);   \
    }

#define __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix)      \
    {                                                                   \
        row0 = XMVectorMultiplyAdd(__DX_LOAD_PS((*pMatrix)[0]), weight, row0); \
        row1 = XMVectorMultiplyAdd(__DX_LOAD_PS((*pMatrix)[1]), weight, row1); \
        row2 = XMVectorMultiplyAdd(__DX_LOAD_PS((*pMatrix)[2]), weight, row2); \
    }

//---------------------------------------------------------------------
// The following macros require variables declared by the caller.
//
// :) Thanks to the row-major matrices used in Ogre, accessing affine matrices is easy.
//---------------------------------------------------------------------

/** Collapse a one-weight matrix.
    The multiply by the weight is eliminated, since the weight should always be equal to one.
*/
#define __COLLAPSE_MATRIX_W1(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
    {                                                                           \
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
        __LOAD_MATRIX(row0, row1, row2, pMatrix0);                              \
    }

/** Collapse a two-weight matrix.
    Based on the fact that the accumulated weights are equal to one, using a lerp
    replaces two multiplies and one add with one multiply and two adds.
*/
#define __COLLAPSE_MATRIX_W2(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
    {                                                                           \
        weight = XMVectorReplicatePtr(pWeights + 1);                            \
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
        __LOAD_MATRIX(row0, row1, row2, pMatrix0);                              \
        pMatrix1 = ppMatrices[pIndices[1]];                                     \
        __LERP_MATRIX(row0, row1, row2, weight, pMatrix1);                      \
    }
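
/* Worked example of the identity behind __COLLAPSE_MATRIX_W2: with two weights
   satisfying w0 + w1 = 1, each collapsed row is

       row = M0 * w0 + M1 * w1
           = M0 * (1 - w1) + M1 * w1
           = M0 + (M1 - M0) * w1 = lerp(M0, M1, w1),

   which is why only the second weight needs to be loaded. */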

/** Collapse a three-weight matrix.
*/
#define __COLLAPSE_MATRIX_W3(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
    {                                                                           \
        weight = XMVectorReplicatePtr(pWeights + 0);                            \
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
        __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0);             \
        weight = XMVectorReplicatePtr(pWeights + 1);                            \
        pMatrix1 = ppMatrices[pIndices[1]];                                     \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1);            \
        weight = XMVectorReplicatePtr(pWeights + 2);                            \
        pMatrix2 = ppMatrices[pIndices[2]];                                     \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2);            \
    }

/** Collapse a four-weight matrix.
*/
#define __COLLAPSE_MATRIX_W4(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
    {                                                                           \
        /* Load four blend weights at one time; they will be shuffled later */  \
        weights = __DX_LOAD_PS(pWeights);                                       \
                                                                                \
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
        weight = XMVectorSplatX(weights);                                       \
        __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0);             \
        pMatrix1 = ppMatrices[pIndices[1]];                                     \
        weight = XMVectorSplatY(weights);                                       \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1);            \
        pMatrix2 = ppMatrices[pIndices[2]];                                     \
        weight = XMVectorSplatZ(weights);                                       \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2);            \
        pMatrix3 = ppMatrices[pIndices[3]];                                     \
        weight = XMVectorSplatW(weights);                                       \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix3);            \
    }


    //---------------------------------------------------------------------
    // Collapse one matrix at a time. The collapsed matrix is weighted by
    // the blend weights, and can then be used to transform the corresponding
    // vertex directly.
    //
    // I'd like to use an inline function instead of a macro here, but I also
    // want to ensure the compiler integrates this code into its callers (in
    // release builds at least), regardless of the specific compile options.
    // An inline function works fine for VC, but gcc (3.4.4 here) looks like it
    // generates a function call when this is implemented as an inline function,
    // even when compiling with the "-O3" option.
    //
#define _collapseOneMatrix(                                                     \
        m00, m01, m02,                                                          \
        pBlendWeight, pBlendIndex,                                              \
        blendMatrices,                                                          \
        blendWeightStride, blendIndexStride,                                    \
        numWeightsPerVertex)                                                    \
    {                                                                           \
        /* Important note: if pMatrixXXX is reused frequently, M$ VC7.1 will */ \
        /* generate wrong code here!!!                                       */ \
        const Matrix4* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3;               \
        XMVECTOR weight, weights;                                               \
                                                                                \
        switch (numWeightsPerVertex)                                            \
        {                                                                       \
        default:    /* Just in case, and to make the compiler happy */         \
        case 1:                                                                 \
            __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 2:                                                                 \
            __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 3:                                                                 \
            __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 4:                                                                 \
            __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            break;                                                              \
        }                                                                       \
    }
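
    // For reference, the collapse implements the standard linear-blend-skinning
    // sum (only the first three rows are kept, since the matrices are affine):
    //
    //     M = w0 * M[i0] + w1 * M[i1] + ... ,   p' = M * (px, py, pz, 1)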

    //---------------------------------------------------------------------
    // Collapse four matrices at a time. The collapsed matrices are weighted by
    // the blend weights, and can then be used to transform the corresponding
    // vertices directly.
    //
    // I'd like to use an inline function instead of a macro here, but I also
    // want to ensure the compiler integrates this code into its callers (in
    // release builds at least), regardless of the specific compile options.
    // An inline function works fine for VC, but gcc (3.4.4 here) looks like it
    // generates a function call when this is implemented as an inline function,
    // even when compiling with the "-O3" option.
    //
#define _collapseFourMatrices(                                                  \
        m00, m01, m02,                                                          \
        m10, m11, m12,                                                          \
        m20, m21, m22,                                                          \
        m30, m31, m32,                                                          \
        pBlendWeight, pBlendIndex,                                              \
        blendMatrices,                                                          \
        blendWeightStride, blendIndexStride,                                    \
        numWeightsPerVertex)                                                    \
    {                                                                           \
        /* Important note: if pMatrixXXX is reused frequently, M$ VC7.1 will */ \
        /* generate wrong code here!!!                                       */ \
        const Matrix4* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3;               \
        XMVECTOR weight, weights;                                               \
                                                                                \
        switch (numWeightsPerVertex)                                            \
        {                                                                       \
        default:    /* Just in case, and to make the compiler happy */         \
        case 1:                                                                 \
            __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W1(m10, m11, m12, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W1(m20, m21, m22, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W1(m30, m31, m32, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 2:                                                                 \
            __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W2(m10, m11, m12, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W2(m20, m21, m22, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W2(m30, m31, m32, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 3:                                                                 \
            __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W3(m10, m11, m12, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W3(m20, m21, m22, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W3(m30, m31, m32, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 4:                                                                 \
            __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W4(m10, m11, m12, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W4(m20, m21, m22, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W4(m30, m31, m32, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
            break;                                                              \
        }                                                                       \
    }


    //---------------------------------------------------------------------
    // General DirectXMath version: skins positions and, optionally, normals.
    static void softwareVertexSkinning_DirectXMath_General(
        const float *pSrcPos, float *pDestPos,
        const float *pSrcNorm, float *pDestNorm,
        const float *pBlendWeight, const unsigned char* pBlendIndex,
        const Matrix4* const* blendMatrices,
        size_t srcPosStride, size_t destPosStride,
        size_t srcNormStride, size_t destNormStride,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numVertices)
    {
        for (size_t i = 0; i < numVertices; ++i)
        {
            // Collapse matrices
            XMVECTOR m00, m01, m02;
            _collapseOneMatrix(
                m00, m01, m02,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex);

            // Advance blend weight and index pointers
            advanceRawPointer(pBlendWeight, blendWeightStride);
            advanceRawPointer(pBlendIndex, blendIndexStride);

            //------------------------------------------------------------------

            XMVECTOR m03 = g_XMZero;
            __DX_TRANSPOSE4x4_PS(m00, m01, m02, m03);
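
            // After the transpose, m00/m01/m02 hold columns 0..2 of the collapsed
            // affine matrix and m03 holds its translation column, so the 4x3 dot
            // product below adds the translation for the implicit w == 1.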

            //------------------------------------------------------------------
            // Transform position
            //------------------------------------------------------------------

            XMVECTOR s0, s1, s2;

            // Load source position
            s0 = XMVectorReplicatePtr(pSrcPos + 0);
            s1 = XMVectorReplicatePtr(pSrcPos + 1);
            s2 = XMVectorReplicatePtr(pSrcPos + 2);

            // Transform by collapsed matrix
            XMVECTOR accumPos = __DX_DOT4x3_PS(m00, m01, m02, m03, s0, s1, s2);   // x y z 0

            // Store blended position; no alignment requirement
            XMStoreFloat3((XMFLOAT3*)(pDestPos + 0), accumPos);

            // Advance source and target position pointers
            advanceRawPointer(pSrcPos, srcPosStride);
            advanceRawPointer(pDestPos, destPosStride);

            //------------------------------------------------------------------
            // Optional blend normal
            //------------------------------------------------------------------

            if (pSrcNorm)
            {
                // Load source normal
                s0 = XMVectorReplicatePtr(pSrcNorm + 0);
                s1 = XMVectorReplicatePtr(pSrcNorm + 1);
                s2 = XMVectorReplicatePtr(pSrcNorm + 2);

                // Transform by collapsed matrix
                XMVECTOR accumNorm = __DX_DOT3x3_PS(m00, m01, m02, s0, s1, s2);   // x y z 0

                // Normalise normal
                accumNorm = XMVector3Normalize(accumNorm);

                // Store blended normal; no alignment requirement
                XMStoreFloat3((XMFLOAT3*)(pDestNorm + 0), accumNorm);

                // Advance source and target normal pointers
                advanceRawPointer(pSrcNorm, srcNormStride);
                advanceRawPointer(pDestNorm, destNormStride);
            }
        }
    }
    //---------------------------------------------------------------------
    // Special DirectXMath version: skins a shared buffer of positions and
    // normals, where the buffer is packed.
    template <bool srcAligned, bool destAligned>
    struct SoftwareVertexSkinning_DirectXMath_PosNorm_Shared_Packed
    {
        static void apply(
            const float* pSrc, float* pDest,
            const float* pBlendWeight, const unsigned char* pBlendIndex,
            const Matrix4* const* blendMatrices,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numIterations)
        {
            typedef DirectXMathMemoryAccessor<srcAligned> SrcAccessor;
            typedef DirectXMathMemoryAccessor<destAligned> DestAccessor;

            // Blend 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Collapse matrices
                XMVECTOR m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
                _collapseFourMatrices(
                    m00, m01, m02,
                    m10, m11, m12,
                    m20, m21, m22,
                    m30, m31, m32,
                    pBlendWeight, pBlendIndex,
                    blendMatrices,
                    blendWeightStride, blendIndexStride,
                    numWeightsPerVertex);

                // Advance 4 vertices
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);

                //------------------------------------------------------------------
                // Transform positions/normals
                //------------------------------------------------------------------

                XMVECTOR s0, s1, s2, s3, s4, s5, d0, d1, d2, d3, d4, d5;
                XMVECTOR t0, t1, t2, t3, t4, t5;

                // Load source positions/normals
                s0 = SrcAccessor::load(pSrc + 0);                       // px0 py0 pz0 nx0
                s1 = SrcAccessor::load(pSrc + 4);                       // ny0 nz0 px1 py1
                s2 = SrcAccessor::load(pSrc + 8);                       // pz1 nx1 ny1 nz1
                s3 = SrcAccessor::load(pSrc + 12);                      // px2 py2 pz2 nx2
                s4 = SrcAccessor::load(pSrc + 16);                      // ny2 nz2 px3 py3
                s5 = SrcAccessor::load(pSrc + 20);                      // pz3 nx3 ny3 nz3

                // Rearrange to component-major order for batch calculation.
                t0 = XMVectorMergeXY(s0, s3);                           // px0 px2 py0 py2
                t1 = XMVectorMergeZW(s0, s3);                           // pz0 pz2 nx0 nx2
                t2 = XMVectorMergeXY(s1, s4);                           // ny0 ny2 nz0 nz2
                t3 = XMVectorMergeZW(s1, s4);                           // px1 px3 py1 py3
                t4 = XMVectorMergeXY(s2, s5);                           // pz1 pz3 nx1 nx3
                t5 = XMVectorMergeZW(s2, s5);                           // ny1 ny3 nz1 nz3

                s0 = XMVectorMergeXY(t0, t3);                           // px0 px1 px2 px3
                s1 = XMVectorMergeZW(t0, t3);                           // py0 py1 py2 py3
                s2 = XMVectorMergeXY(t1, t4);                           // pz0 pz1 pz2 pz3
                s3 = XMVectorMergeZW(t1, t4);                           // nx0 nx1 nx2 nx3
                s4 = XMVectorMergeXY(t2, t5);                           // ny0 ny1 ny2 ny3
                s5 = XMVectorMergeZW(t2, t5);                           // nz0 nz1 nz2 nz3

                // Transform by collapsed matrices

                // Shuffle row 0 of the four collapsed matrices to calculate the X components
                __DX_TRANSPOSE4x4_PS(m00, m10, m20, m30);

                // Transform X components
                d0 = __DX_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // PX0 PX1 PX2 PX3
                d3 = __DX_DOT3x3_PS(m00, m10, m20, s3, s4, s5);         // NX0 NX1 NX2 NX3

                // Shuffle row 1 of the four collapsed matrices to calculate the Y components
                __DX_TRANSPOSE4x4_PS(m01, m11, m21, m31);

                // Transform Y components
                d1 = __DX_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // PY0 PY1 PY2 PY3
                d4 = __DX_DOT3x3_PS(m01, m11, m21, s3, s4, s5);         // NY0 NY1 NY2 NY3

                // Shuffle row 2 of the four collapsed matrices to calculate the Z components
                __DX_TRANSPOSE4x4_PS(m02, m12, m22, m32);

                // Transform Z components
                d2 = __DX_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // PZ0 PZ1 PZ2 PZ3
                d5 = __DX_DOT3x3_PS(m02, m12, m22, s3, s4, s5);         // NZ0 NZ1 NZ2 NZ3

                // Normalise normals
                XMVECTOR tmp = __DX_DOT3x3_PS(d3, d4, d5, d3, d4, d5);
                tmp = XMVectorReciprocalSqrtEst(tmp);
                d3 = XMVectorMultiply(d3, tmp);
                d4 = XMVectorMultiply(d4, tmp);
                d5 = XMVectorMultiply(d5, tmp);
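
                // Note: d3/d4/d5 hold the normal components in component-major
                // (SoA) form, so the single reciprocal-sqrt estimate above
                // normalises all four normals at once.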

                // Arrange back to the continuous format to store the results

                t0 = XMVectorMergeXY(d0, d1); // PX0 PY0 PX1 PY1
                t1 = XMVectorMergeZW(d0, d1); // PX2 PY2 PX3 PY3
                t2 = XMVectorMergeXY(d2, d3); // PZ0 NX0 PZ1 NX1
                t3 = XMVectorMergeZW(d2, d3); // PZ2 NX2 PZ3 NX3
                t4 = XMVectorMergeXY(d4, d5); // NY0 NZ0 NY1 NZ1
                t5 = XMVectorMergeZW(d4, d5); // NY2 NZ2 NY3 NZ3

                d0 = XMVectorPermute<0, 1, 4, 5>(t0, t2); // PX0 PY0 PZ0 NX0
                d1 = XMVectorPermute<0, 1, 6, 7>(t4, t0); // NY0 NZ0 PX1 PY1
                d2 = XMVectorPermute<6, 7, 2, 3>(t4, t2); // PZ1 NX1 NY1 NZ1
                d3 = XMVectorPermute<0, 1, 4, 5>(t1, t3); // PX2 PY2 PZ2 NX2
                d4 = XMVectorPermute<0, 1, 6, 7>(t5, t1); // NY2 NZ2 PX3 PY3
                d5 = XMVectorPermute<6, 7, 2, 3>(t5, t3); // PZ3 NX3 NY3 NZ3

                // Store blended positions/normals
                DestAccessor::store(pDest + 0, d0);
                DestAccessor::store(pDest + 4, d1);
                DestAccessor::store(pDest + 8, d2);
                DestAccessor::store(pDest + 12, d3);
                DestAccessor::store(pDest + 16, d4);
                DestAccessor::store(pDest + 20, d5);

                // Advance 4 vertices
                pSrc += 4 * (3 + 3);
                pDest += 4 * (3 + 3);
            }
        }
    };
    static FORCEINLINE void softwareVertexSkinning_DirectXMath_PosNorm_Shared_Packed(
            const float* pSrcPos, float* pDestPos,
            const float* pBlendWeight, const unsigned char* pBlendIndex,
            const Matrix4* const* blendMatrices,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numIterations)
    {
        // pSrcPos might not be 16-byte aligned, because of the 8-byte alignment shift per vertex

        // Instantiate only two versions, since the other alignment combinations are not that important.
        if (_isAlignedForDirectXMath(pSrcPos) && _isAlignedForDirectXMath(pDestPos))
        {
            SoftwareVertexSkinning_DirectXMath_PosNorm_Shared_Packed<true, true>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
        else
        {
            SoftwareVertexSkinning_DirectXMath_PosNorm_Shared_Packed<false, false>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
    }
    //---------------------------------------------------------------------
    // Special DirectXMath version: skins separate buffers of positions and
    // normals, where both the position and normal buffers are packed.
    template <bool srcPosAligned, bool destPosAligned, bool srcNormAligned, bool destNormAligned>
    struct SoftwareVertexSkinning_DirectXMath_PosNorm_Separated_Packed
    {
        static void apply(
            const float* pSrcPos, float* pDestPos,
            const float* pSrcNorm, float* pDestNorm,
            const float* pBlendWeight, const unsigned char* pBlendIndex,
            const Matrix4* const* blendMatrices,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numIterations)
        {
            typedef DirectXMathMemoryAccessor<srcPosAligned> SrcPosAccessor;
            typedef DirectXMathMemoryAccessor<destPosAligned> DestPosAccessor;
            typedef DirectXMathMemoryAccessor<srcNormAligned> SrcNormAccessor;
            typedef DirectXMathMemoryAccessor<destNormAligned> DestNormAccessor;

            // Blend 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Collapse matrices
                XMVECTOR m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
                _collapseFourMatrices(
                    m00, m01, m02,
                    m10, m11, m12,
                    m20, m21, m22,
                    m30, m31, m32,
                    pBlendWeight, pBlendIndex,
                    blendMatrices,
                    blendWeightStride, blendIndexStride,
                    numWeightsPerVertex);

                // Advance 4 vertices
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);

                //------------------------------------------------------------------
                // Transform positions
                //------------------------------------------------------------------

                XMVECTOR s0, s1, s2, d0, d1, d2;

                // Load source positions
                s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
                s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
                s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3

                // Arrange to 3x4 component-major order for batch calculation
                __DX_TRANSPOSE4x3_PS(s0, s1, s2);

                // Transform by collapsed matrices

                // Shuffle row 0 of the four collapsed matrices to calculate the X components
                __DX_TRANSPOSE4x4_PS(m00, m10, m20, m30);

                // Transform X components
                d0 = __DX_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3

                // Shuffle row 1 of the four collapsed matrices to calculate the Y components
                __DX_TRANSPOSE4x4_PS(m01, m11, m21, m31);

                // Transform Y components
                d1 = __DX_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3

                // Shuffle row 2 of the four collapsed matrices to calculate the Z components
                __DX_TRANSPOSE4x4_PS(m02, m12, m22, m32);

                // Transform Z components
                d2 = __DX_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3

                // Arrange back to the 4x3 continuous format to store the results
                __DX_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store blended positions
                DestPosAccessor::store(pDestPos + 0, d0);
                DestPosAccessor::store(pDestPos + 4, d1);
                DestPosAccessor::store(pDestPos + 8, d2);

                // Advance 4 vertices
                pSrcPos += 4 * 3;
                pDestPos += 4 * 3;

                //------------------------------------------------------------------
                // Transform normals
                //------------------------------------------------------------------

                // Load source normals
                s0 = SrcNormAccessor::load(pSrcNorm + 0);               // x0 y0 z0 x1
                s1 = SrcNormAccessor::load(pSrcNorm + 4);               // y1 z1 x2 y2
                s2 = SrcNormAccessor::load(pSrcNorm + 8);               // z2 x3 y3 z3

                // Arrange to 3x4 component-major order for batch calculation
                __DX_TRANSPOSE4x3_PS(s0, s1, s2);

                // Transform by the collapsed and shuffled matrices
                d0 = __DX_DOT3x3_PS(m00, m10, m20, s0, s1, s2);         // X0 X1 X2 X3
                d1 = __DX_DOT3x3_PS(m01, m11, m21, s0, s1, s2);         // Y0 Y1 Y2 Y3
                d2 = __DX_DOT3x3_PS(m02, m12, m22, s0, s1, s2);         // Z0 Z1 Z2 Z3

                // Normalise normals
                XMVECTOR tmp = __DX_DOT3x3_PS(d0, d1, d2, d0, d1, d2);
                tmp = XMVectorReciprocalSqrtEst(tmp);
                d0 = XMVectorMultiply(d0, tmp);
                d1 = XMVectorMultiply(d1, tmp);
                d2 = XMVectorMultiply(d2, tmp);

                // Arrange back to the 4x3 continuous format to store the results
                __DX_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store blended normals
                DestNormAccessor::store(pDestNorm + 0, d0);
                DestNormAccessor::store(pDestNorm + 4, d1);
                DestNormAccessor::store(pDestNorm + 8, d2);

                // Advance 4 vertices
                pSrcNorm += 4 * 3;
                pDestNorm += 4 * 3;
            }
        }
    };
    static FORCEINLINE void softwareVertexSkinning_DirectXMath_PosNorm_Separated_Packed(
        const float* pSrcPos, float* pDestPos,
        const float* pSrcNorm, float* pDestNorm,
        const float* pBlendWeight, const unsigned char* pBlendIndex,
        const Matrix4* const* blendMatrices,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numIterations)
    {
        assert(_isAlignedForDirectXMath(pSrcPos));

        // Instantiate only two versions, since the other alignment combinations are not that important.
        if (_isAlignedForDirectXMath(pSrcNorm) && _isAlignedForDirectXMath(pDestPos) && _isAlignedForDirectXMath(pDestNorm))
        {
            SoftwareVertexSkinning_DirectXMath_PosNorm_Separated_Packed<true, true, true, true>::apply(
                pSrcPos, pDestPos,
                pSrcNorm, pDestNorm,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
        else
        {
            SoftwareVertexSkinning_DirectXMath_PosNorm_Separated_Packed<true, false, false, false>::apply(
                pSrcPos, pDestPos,
                pSrcNorm, pDestNorm,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
    }
896     //---------------------------------------------------------------------
897     // Special DirectXMath version skinning position only, the position buffer are
898     // packed.
899     template <bool srcPosAligned, bool destPosAligned>
900     struct SoftwareVertexSkinning_DirectXMath_PosOnly_Packed
901     {
applyOgre::SoftwareVertexSkinning_DirectXMath_PosOnly_Packed902         static void apply(
903             const float* pSrcPos, float* pDestPos,
904             const float* pBlendWeight, const unsigned char* pBlendIndex,
905             const Matrix4* const* blendMatrices,
906             size_t blendWeightStride, size_t blendIndexStride,
907             size_t numWeightsPerVertex,
908             size_t numIterations)
909         {
910             typedef DirectXMathMemoryAccessor<srcPosAligned> SrcPosAccessor;
911             typedef DirectXMathMemoryAccessor<destPosAligned> DestPosAccessor;
912 
913             // Blending 4 vertices per-iteration
914             for (size_t i = 0; i < numIterations; ++i)
915             {
916                 // Collapse matrices
917                 XMVECTOR m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
918                 _collapseFourMatrices(
919                     m00, m01, m02,
920                     m10, m11, m12,
921                     m20, m21, m22,
922                     m30, m31, m32,
923                     pBlendWeight, pBlendIndex,
924                     blendMatrices,
925                     blendWeightStride, blendIndexStride,
926                     numWeightsPerVertex);
927 
928                 // Advance 4 vertices
929                 advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
930                 advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
931 
932                 //------------------------------------------------------------------
933                 // Transform positions
934                 //------------------------------------------------------------------
935 
936                 XMVECTOR s0, s1, s2, d0, d1, d2;
937 
938                 // Load source positions
939                 s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
940                 s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
941                 s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3
942 
943                 // Arrange to 3x4 component-major for batches calculate
944                 __DX_TRANSPOSE4x3_PS(s0, s1, s2);
945 
946                 // Transform by collapsed matrix
947 
948                 // Shuffle row 0 of four collapsed matrices for calculate X component
949                 __DX_TRANSPOSE4x4_PS(m00, m10, m20, m30);
950 
951                 // Transform X components
952                 d0 = __DX_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3
953 
954                 // Shuffle row 1 of four collapsed matrices for calculate Y component
955                 __DX_TRANSPOSE4x4_PS(m01, m11, m21, m31);
956 
957                 // Transform Y components
958                 d1 = __DX_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3
959 
960                 // Shuffle row 2 of four collapsed matrices for calculate Z component
961                 __DX_TRANSPOSE4x4_PS(m02, m12, m22, m32);
962 
963                 // Transform Z components
964                 d2 = __DX_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3
965 
966                 // Arrange back to 4x3 continuous format for store results
967                 __DX_TRANSPOSE3x4_PS(d0, d1, d2);
968 
969                 // Store blended positions
970                 DestPosAccessor::store(pDestPos + 0, d0);
971                 DestPosAccessor::store(pDestPos + 4, d1);
972                 DestPosAccessor::store(pDestPos + 8, d2);
973 
974                 // Advance 4 vertices
975                 pSrcPos += 4 * 3;
976                 pDestPos += 4 * 3;
977             }
978         }
979     };
    static FORCEINLINE void softwareVertexSkinning_DirectXMath_PosOnly_Packed(
        const float* pSrcPos, float* pDestPos,
        const float* pBlendWeight, const unsigned char* pBlendIndex,
        const Matrix4* const* blendMatrices,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numIterations)
    {
        assert(_isAlignedForDirectXMath(pSrcPos));

        // Instantiate two versions only, since the other alignment combinations
        // are not that important.
        if (_isAlignedForDirectXMath(pDestPos))
        {
            SoftwareVertexSkinning_DirectXMath_PosOnly_Packed<true, true>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
        else
        {
            SoftwareVertexSkinning_DirectXMath_PosOnly_Packed<true, false>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
    }
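    // The aligned/unaligned dispatch above is the pattern used by every packed
    // routine in this file: alignment is tested once at runtime, then baked
    // into the template parameters so the inner loops pay no per-access
    // branch. A minimal sketch of the idea (hypothetical names, for
    // illustration only):
    //
    //     template <bool aligned> struct Accessor;
    //     template <> struct Accessor<true>
    //     {
    //         static XMVECTOR load(const float* p)
    //             { return XMLoadFloat4A((const XMFLOAT4A*)p); }
    //     };
    //     template <> struct Accessor<false>
    //     {
    //         static XMVECTOR load(const float* p)
    //             { return XMLoadFloat4((const XMFLOAT4*)p); }
    //     };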
    //---------------------------------------------------------------------
    void OptimisedUtilDirectXMath::softwareVertexSkinning(
        const float *pSrcPos, float *pDestPos,
        const float *pSrcNorm, float *pDestNorm,
        const float *pBlendWeight, const unsigned char* pBlendIndex,
        const Matrix4* const* blendMatrices,
        size_t srcPosStride, size_t destPosStride,
        size_t srcNormStride, size_t destNormStride,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numVertices)
    {
        // All position/normal pointers should be perfectly aligned, but we still
        // check here to guard against hardware buffers allocated by a buggy
        // driver that doesn't honour the alignment properly.
        // Because a meta-function technique is used here, the code is easy to
        // maintain and still covers all possible alignment combinations.
        //

        // Use unrolled routines only if there are a lot of vertices
        if (numVertices > OGRE_DIRECTXMATH_SKINNING_UNROLL_VERTICES)
        {
            if (pSrcNorm)
            {
                // Blend position and normal

                if (srcPosStride == sizeof(float) * (3 + 3) && destPosStride == sizeof(float) * (3 + 3) &&
                    pSrcNorm == pSrcPos + 3 && pDestNorm == pDestPos + 3)
                {
                    // Position and normal share one packed (interleaved) buffer

                    size_t srcPosAlign = (size_t)pSrcPos & 15;
                    assert((srcPosAlign & 3) == 0);

                    // Blend unaligned vertices with the general SIMD routine
                    if (srcPosAlign == 8)   // Because the alignment shifts by 8 bytes per vertex
                    {
                        size_t count = srcPosAlign / 8;
                        numVertices -= count;
                        softwareVertexSkinning_DirectXMath_General(
                            pSrcPos, pDestPos,
                            pSrcNorm, pDestNorm,
                            pBlendWeight, pBlendIndex,
                            blendMatrices,
                            srcPosStride, destPosStride,
                            srcNormStride, destNormStride,
                            blendWeightStride, blendIndexStride,
                            numWeightsPerVertex,
                            count);

                        pSrcPos += count * (3 + 3);
                        pDestPos += count * (3 + 3);
                        pSrcNorm += count * (3 + 3);
                        pDestNorm += count * (3 + 3);
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
                    }

                    // Blend vertices, four vertices per iteration
                    size_t numIterations = numVertices / 4;
                    softwareVertexSkinning_DirectXMath_PosNorm_Shared_Packed(
                        pSrcPos, pDestPos,
                        pBlendWeight, pBlendIndex,
                        blendMatrices,
                        blendWeightStride, blendIndexStride,
                        numWeightsPerVertex,
                        numIterations);

                    // Advance pointers for remaining vertices
                    numVertices &= 3;
                    if (numVertices)
                    {
                        pSrcPos += numIterations * 4 * (3 + 3);
                        pDestPos += numIterations * 4 * (3 + 3);
                        pSrcNorm += numIterations * 4 * (3 + 3);
                        pDestNorm += numIterations * 4 * (3 + 3);
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
                    }
                }
                else if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3 &&
                         srcNormStride == sizeof(float) * 3 && destNormStride == sizeof(float) * 3)
                {
                    // Position and normal are in separate buffers, all of them packed

                    size_t srcPosAlign = (size_t)pSrcPos & 15;
                    assert((srcPosAlign & 3) == 0);

                    // Blend unaligned vertices with the general SIMD routine
                    if (srcPosAlign)
                    {
                        size_t count = srcPosAlign / 4;
                        numVertices -= count;
                        softwareVertexSkinning_DirectXMath_General(
                            pSrcPos, pDestPos,
                            pSrcNorm, pDestNorm,
                            pBlendWeight, pBlendIndex,
                            blendMatrices,
                            srcPosStride, destPosStride,
                            srcNormStride, destNormStride,
                            blendWeightStride, blendIndexStride,
                            numWeightsPerVertex,
                            count);

                        pSrcPos += count * 3;
                        pDestPos += count * 3;
                        pSrcNorm += count * 3;
                        pDestNorm += count * 3;
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
                    }

                    // Blend vertices, four vertices per iteration
                    size_t numIterations = numVertices / 4;
                    softwareVertexSkinning_DirectXMath_PosNorm_Separated_Packed(
                        pSrcPos, pDestPos,
                        pSrcNorm, pDestNorm,
                        pBlendWeight, pBlendIndex,
                        blendMatrices,
                        blendWeightStride, blendIndexStride,
                        numWeightsPerVertex,
                        numIterations);

                    // Advance pointers for remaining vertices
                    numVertices &= 3;
                    if (numVertices)
                    {
                        pSrcPos += numIterations * 4 * 3;
                        pDestPos += numIterations * 4 * 3;
                        pSrcNorm += numIterations * 4 * 3;
                        pDestNorm += numIterations * 4 * 3;
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
                    }
                }
                else    // Not 'packed' form or wrong order between position and normal
                {
                    // Should never occur, do nothing here just in case
                }
            }
            else    // !pSrcNorm
            {
                // Blend position only

                if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3)
                {
                    // All buffers are packed

                    size_t srcPosAlign = (size_t)pSrcPos & 15;
                    assert((srcPosAlign & 3) == 0);

                    // Blend unaligned vertices with the general SIMD routine
                    if (srcPosAlign)
                    {
                        size_t count = srcPosAlign / 4;
                        numVertices -= count;
                        softwareVertexSkinning_DirectXMath_General(
                            pSrcPos, pDestPos,
                            pSrcNorm, pDestNorm,
                            pBlendWeight, pBlendIndex,
                            blendMatrices,
                            srcPosStride, destPosStride,
                            srcNormStride, destNormStride,
                            blendWeightStride, blendIndexStride,
                            numWeightsPerVertex,
                            count);

                        pSrcPos += count * 3;
                        pDestPos += count * 3;
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
                    }

                    // Blend vertices, four vertices per iteration
                    size_t numIterations = numVertices / 4;
                    softwareVertexSkinning_DirectXMath_PosOnly_Packed(
                        pSrcPos, pDestPos,
                        pBlendWeight, pBlendIndex,
                        blendMatrices,
                        blendWeightStride, blendIndexStride,
                        numWeightsPerVertex,
                        numIterations);

                    // Advance pointers for remaining vertices
                    numVertices &= 3;
                    if (numVertices)
                    {
                        pSrcPos += numIterations * 4 * 3;
                        pDestPos += numIterations * 4 * 3;
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
                    }
                }
                else    // Not 'packed' form
                {
                    // Might occur only if the user forced software blending of positions only
                }
            }
        }

        // Blend the remaining vertices; this must also be done with SIMD to get
        // identical results, since mixing the general floating-point and SIMD
        // algorithms would introduce floating-point discrepancies.
        if (numVertices)
        {
            softwareVertexSkinning_DirectXMath_General(
                pSrcPos, pDestPos,
                pSrcNorm, pDestNorm,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                srcPosStride, destPosStride,
                srcNormStride, destNormStride,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numVertices);
        }
    }
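    // Illustrative scalar reference of what the skinning above computes per
    // vertex (a sketch for clarity only, not the code path actually taken):
    // the blended position is the weighted sum of the source position
    // transformed by each referenced bone matrix; normals use the same sum
    // without the translation part.
    //
    //     Vector3 pos(0, 0, 0);
    //     for (size_t w = 0; w < numWeightsPerVertex; ++w)
    //     {
    //         const Matrix4& m = *blendMatrices[pBlendIndex[w]];
    //         pos += pBlendWeight[w] * (m * srcPos);   // affine transform
    //     }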
    //---------------------------------------------------------------------
    void OptimisedUtilDirectXMath::softwareVertexMorph(
        Real t,
        const float *pSrc1, const float *pSrc2,
        float *pDst,
        size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
        size_t numVertices,
        bool morphNormals)
    {
        XMVECTOR src01, src02, src11, src12, src21, src22;
        XMVECTOR dst0, dst1, dst2;

        XMVECTOR t4 = XMVectorReplicate(t);


        // If we're morphing normals, we have twice the number of floats to process.
        // Positions are interleaved with normals, so we'll have to normalise just
        // the normals separately later; in this first pass we only lerp.
        // We can't normalise as we go because normals & positions are only 3 floats
        // each, so they are not aligned for DirectXMath and we'd mix the data up.
        size_t normalsMultiplier = morphNormals ? 2 : 1;
        size_t numIterations = (numVertices*normalsMultiplier) / 4;
        size_t numVerticesRemainder = (numVertices*normalsMultiplier) & 3;

        // Save for later
        float *pStartDst = pDst;

        // Never use the meta-function technique for accessing memory here, because
        // VC7.1 seems to generate somewhat inefficient binary code when the
        // following code is put into an inline function.

        if (_isAlignedForDirectXMath(pSrc1) && _isAlignedForDirectXMath(pSrc2) && _isAlignedForDirectXMath(pDst))
        {
            // All data aligned

            // Morph 4 vertices per iteration. Specially designed to use as many
            // of the available CPU registers as possible (7 registers used here),
            // and to avoid temporaries spilled to the stack, which would cause
            // extra memory accesses.
            for (size_t i = 0; i < numIterations; ++i)
            {
                // 12 floating-point values
                src01 = __DX_LOAD_PS(pSrc1 + 0);
                src02 = __DX_LOAD_PS(pSrc2 + 0);
                src11 = __DX_LOAD_PS(pSrc1 + 4);
                src12 = __DX_LOAD_PS(pSrc2 + 4);
                src21 = __DX_LOAD_PS(pSrc1 + 8);
                src22 = __DX_LOAD_PS(pSrc2 + 8);
                pSrc1 += 12; pSrc2 += 12;

                dst0 = __DX_LERP_PS(t4, src01, src02);
                dst1 = __DX_LERP_PS(t4, src11, src12);
                dst2 = __DX_LERP_PS(t4, src21, src22);

                __DX_STORE_PS(pDst + 0, dst0);
                __DX_STORE_PS(pDst + 4, dst1);
                __DX_STORE_PS(pDst + 8, dst2);
                pDst += 12;
            }

            // Morph remaining vertices
            switch (numVerticesRemainder)
            {
            case 3:
                // 9 floating-point values
                src01 = __DX_LOAD_PS(pSrc1 + 0);
                src02 = __DX_LOAD_PS(pSrc2 + 0);
                src11 = __DX_LOAD_PS(pSrc1 + 4);
                src12 = __DX_LOAD_PS(pSrc2 + 4);
                src21 = XMLoadFloat(pSrc1 + 8);
                src22 = XMLoadFloat(pSrc2 + 8);

                dst0 = __DX_LERP_PS(t4, src01, src02);
                dst1 = __DX_LERP_PS(t4, src11, src12);
                dst2 = __DX_LERP_SS(t4, src21, src22);

                __DX_STORE_PS(pDst + 0, dst0);
                __DX_STORE_PS(pDst + 4, dst1);
                XMStoreFloat(pDst + 8, dst2);
                break;

            case 2:
                // 6 floating-point values
                src01 = __DX_LOAD_PS(pSrc1 + 0);
                src02 = __DX_LOAD_PS(pSrc2 + 0);
                src11 = XMLoadFloat2((XMFLOAT2*)(pSrc1 + 4));
                src12 = XMLoadFloat2((XMFLOAT2*)(pSrc2 + 4));

                dst0 = __DX_LERP_PS(t4, src01, src02);
                dst1 = __DX_LERP_PS(t4, src11, src12);

                __DX_STORE_PS(pDst + 0, dst0);
                XMStoreFloat2((XMFLOAT2*)(pDst + 4), dst1);
                break;

            case 1:
                // 3 floating-point values
                src01 = XMLoadFloat3((XMFLOAT3*)(pSrc1 + 0));
                src02 = XMLoadFloat3((XMFLOAT3*)(pSrc2 + 0));

                dst0 = __DX_LERP_PS(t4, src01, src02);

                XMStoreFloat3((XMFLOAT3*)(pDst + 0), dst0);
                break;
            }
        }
        else    // Should never occur, just in case of buggy drivers
        {
            // Assume all data unaligned

            // Morph 4 vertices per iteration. Specially designed to use as many
            // of the available CPU registers as possible (7 registers used here),
            // and to avoid temporaries spilled to the stack, which would cause
            // extra memory accesses.
            for (size_t i = 0; i < numIterations; ++i)
            {
                // 12 floating-point values
                src01 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 0));
                src02 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 0));
                src11 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 4));
                src12 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 4));
                src21 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 8));
                src22 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 8));
                pSrc1 += 12; pSrc2 += 12;

                dst0 = __DX_LERP_PS(t4, src01, src02);
                dst1 = __DX_LERP_PS(t4, src11, src12);
                dst2 = __DX_LERP_PS(t4, src21, src22);

                XMStoreFloat4((XMFLOAT4*)(pDst + 0), dst0);
                XMStoreFloat4((XMFLOAT4*)(pDst + 4), dst1);
                XMStoreFloat4((XMFLOAT4*)(pDst + 8), dst2);
                pDst += 12;
            }

            // Morph remaining vertices
            switch (numVerticesRemainder)
            {
            case 3:
                // 9 floating-point values
                src01 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 0));
                src02 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 0));
                src11 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 4));
                src12 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 4));
                src21 = XMLoadFloat(pSrc1 + 8);
                src22 = XMLoadFloat(pSrc2 + 8);

                dst0 = __DX_LERP_PS(t4, src01, src02);
                dst1 = __DX_LERP_PS(t4, src11, src12);
                dst2 = __DX_LERP_SS(t4, src21, src22);

                XMStoreFloat4((XMFLOAT4*)(pDst + 0), dst0);
                XMStoreFloat4((XMFLOAT4*)(pDst + 4), dst1);
                XMStoreFloat(pDst + 8, dst2);
                break;

            case 2:
                // 6 floating-point values
                src01 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 0));
                src02 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 0));
                src11 = XMLoadFloat2((XMFLOAT2*)(pSrc1 + 4));
                src12 = XMLoadFloat2((XMFLOAT2*)(pSrc2 + 4));

                dst0 = __DX_LERP_PS(t4, src01, src02);
                dst1 = __DX_LERP_PS(t4, src11, src12);

                XMStoreFloat4((XMFLOAT4*)(pDst + 0), dst0);
                XMStoreFloat2((XMFLOAT2*)(pDst + 4), dst1);
                break;

            case 1:
                // 3 floating-point values
                src01 = XMLoadFloat3((XMFLOAT3*)(pSrc1 + 0));
                src02 = XMLoadFloat3((XMFLOAT3*)(pSrc2 + 0));

                dst0 = __DX_LERP_PS(t4, src01, src02);

                XMStoreFloat3((XMFLOAT3*)(pDst + 0), dst0);
                break;
            }

        }

        if (morphNormals)
        {

            // Now we need to do an unaligned normalise on the normals data we just
            // lerped; because normals are 3 elements each they're always unaligned
            float *pNorm = pStartDst;

            // Offset past the first position
            pNorm += 3;

            // We'll do one normal per iteration, but still use DirectXMath
            for (size_t n = 0; n < numVertices; ++n)
            {
                // Normalise one normal
                XMVECTOR norm;

                // Load 3 floating-point normal values
                norm = XMLoadFloat3((XMFLOAT3*)pNorm);
                norm = XMVector3Normalize(norm);

                // Store back in the same place
                XMStoreFloat3((XMFLOAT3*)pNorm, norm);

                // Skip to the next vertex (3x normal components, 3x position components)
                pNorm += 6;
            }
        }
    }
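    // Illustrative scalar reference of the morph above, assuming
    // __DX_LERP_PS(t, a, b) computes a + t * (b - a) (a sketch for clarity
    // only, not the code path actually taken):
    //
    //     size_t numFloats = numVertices * (morphNormals ? 6 : 3);
    //     for (size_t i = 0; i < numFloats; ++i)
    //         pDst[i] = pSrc1[i] + t * (pSrc2[i] - pSrc1[i]);   // lerp
    //
    //     // Then, when morphing normals, each interleaved normal is
    //     // renormalised in a second pass, exactly as above.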
    //---------------------------------------------------------------------
    void OptimisedUtilDirectXMath::concatenateAffineMatrices(
        const Matrix4& baseMatrix,
        const Matrix4* pSrcMat,
        Matrix4* pDstMat,
        size_t numMatrices)
    {
        assert(_isAlignedForDirectXMath(pSrcMat));
        assert(_isAlignedForDirectXMath(pDstMat));

        // Load base matrix, unaligned
        XMVECTOR m0 = XMLoadFloat4((XMFLOAT4*)baseMatrix[0]);
        XMVECTOR m1 = XMLoadFloat4((XMFLOAT4*)baseMatrix[1]);
        XMVECTOR m2 = XMLoadFloat4((XMFLOAT4*)baseMatrix[2]);
        XMVECTOR m3 = XMLoadFloat4((XMFLOAT4*)baseMatrix[3]);        // m3 should be equal to (0, 0, 0, 1)

        for (size_t i = 0; i < numMatrices; ++i)
        {
            // Load source matrix, aligned
            XMVECTOR s0 = __DX_LOAD_PS((*pSrcMat)[0]);
            XMVECTOR s1 = __DX_LOAD_PS((*pSrcMat)[1]);
            XMVECTOR s2 = __DX_LOAD_PS((*pSrcMat)[2]);

            ++pSrcMat;

            XMVECTOR t0, t1, t2, t3;

            // Concatenate matrix, and store results

            // Row 0
            t0 = XMVectorMultiply(XMVectorSplatX(m0), s0);
            t1 = XMVectorMultiply(XMVectorSplatY(m0), s1);
            t2 = XMVectorMultiply(XMVectorSplatZ(m0), s2);
            t3 = XMVectorMultiply(m0, m3);    // Compiler should optimise this out of the loop
            __DX_STORE_PS((*pDstMat)[0], __DX_ACCUM4_PS(t0,t1,t2,t3));

            // Row 1
            t0 = XMVectorMultiply(XMVectorSplatX(m1), s0);
            t1 = XMVectorMultiply(XMVectorSplatY(m1), s1);
            t2 = XMVectorMultiply(XMVectorSplatZ(m1), s2);
            t3 = XMVectorMultiply(m1, m3);    // Compiler should optimise this out of the loop
            __DX_STORE_PS((*pDstMat)[1], __DX_ACCUM4_PS(t0,t1,t2,t3));

            // Row 2
            t0 = XMVectorMultiply(XMVectorSplatX(m2), s0);
            t1 = XMVectorMultiply(XMVectorSplatY(m2), s1);
            t2 = XMVectorMultiply(XMVectorSplatZ(m2), s2);
            t3 = XMVectorMultiply(m2, m3);    // Compiler should optimise this out of the loop
            __DX_STORE_PS((*pDstMat)[2], __DX_ACCUM4_PS(t0,t1,t2,t3));

            // Row 3
            __DX_STORE_PS((*pDstMat)[3], m3);

            ++pDstMat;
        }
    }
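    // Illustrative scalar reference (a sketch for clarity only): each output
    // is the affine product baseMatrix * srcMatrices[i]; since both operands
    // keep (0, 0, 0, 1) as their last row, the result's last row can be
    // stored directly. In plain OGRE terms this is roughly:
    //
    //     for (size_t i = 0; i < numMatrices; ++i)
    //         pDstMat[i] = baseMatrix.concatenateAffine(pSrcMat[i]);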
    //---------------------------------------------------------------------
    void OptimisedUtilDirectXMath::calculateFaceNormals(
        const float *positions,
        const EdgeData::Triangle *triangles,
        Vector4 *faceNormals,
        size_t numTriangles)
    {
        assert(_isAlignedForDirectXMath(faceNormals));

        size_t numIterations = numTriangles / 4;
        numTriangles &= 3;

        // Four triangles per iteration
        for (size_t i = 0; i < numIterations; ++i)
        {

// Load four Vector3s as: (x0, x1, x2, x3), (y0, y1, y2, y3), (z0, z1, z2, z3)
#define __LOAD_FOUR_VECTOR3(x, y, z, p0, p1, p2, p3)                            \
            {                                                                   \
                XMVECTOR v0 = XMLoadFloat3((XMFLOAT3*)(p0)); /* x0 y0 z0 -- */  \
                XMVECTOR v1 = XMLoadFloat3((XMFLOAT3*)(p1)); /* x1 y1 z1 -- */  \
                XMVECTOR v2 = XMLoadFloat3((XMFLOAT3*)(p2)); /* x2 y2 z2 -- */  \
                XMVECTOR v3 = XMLoadFloat3((XMFLOAT3*)(p3)); /* x3 y3 z3 -- */  \
                XMVECTOR t0, t1;                                                \
                                                                                \
                t0 = XMVectorMergeXY(v0, v2);       /* x0 x2 y0 y2 */           \
                t1 = XMVectorMergeXY(v1, v3);       /* x1 x3 y1 y3 */           \
                x = XMVectorMergeXY(t0, t1);        /* x0 x1 x2 x3 */           \
                y = XMVectorMergeZW(t0, t1);        /* y0 y1 y2 y3 */           \
                                                                                \
                t0 = XMVectorMergeZW(v0, v2);       /* z0 z2 -- -- */           \
                t1 = XMVectorMergeZW(v1, v3);       /* z1 z3 -- -- */           \
                z = XMVectorMergeXY(t0, t1);        /* z0 z1 z2 z3 */           \
            }

            XMVECTOR x0, x1, x2, y0, y1, y2, z0, z1, z2;

            // Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz
            __LOAD_FOUR_VECTOR3(x0, y0, z0,
                positions + triangles[0].vertIndex[0] * 3,
                positions + triangles[1].vertIndex[0] * 3,
                positions + triangles[2].vertIndex[0] * 3,
                positions + triangles[3].vertIndex[0] * 3);

            // Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz
            __LOAD_FOUR_VECTOR3(x1, y1, z1,
                positions + triangles[0].vertIndex[1] * 3,
                positions + triangles[1].vertIndex[1] * 3,
                positions + triangles[2].vertIndex[1] * 3,
                positions + triangles[3].vertIndex[1] * 3);

            // Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz
            __LOAD_FOUR_VECTOR3(x2, y2, z2,
                positions + triangles[0].vertIndex[2] * 3,
                positions + triangles[1].vertIndex[2] * 3,
                positions + triangles[2].vertIndex[2] * 3,
                positions + triangles[3].vertIndex[2] * 3);

            triangles += 4;

            // Calculate triangle face normals

            // a = v1 - v0
            XMVECTOR ax = XMVectorSubtract(x1, x0);
            XMVECTOR ay = XMVectorSubtract(y1, y0);
            XMVECTOR az = XMVectorSubtract(z1, z0);

            // b = v2 - v0
            XMVECTOR bx = XMVectorSubtract(x2, x0);
            XMVECTOR by = XMVectorSubtract(y2, y0);
            XMVECTOR bz = XMVectorSubtract(z2, z0);

            // n = a cross b
            XMVECTOR nx = XMVectorSubtract(XMVectorMultiply(ay, bz), XMVectorMultiply(az, by));
            XMVECTOR ny = XMVectorSubtract(XMVectorMultiply(az, bx), XMVectorMultiply(ax, bz));
            XMVECTOR nz = XMVectorSubtract(XMVectorMultiply(ax, by), XMVectorMultiply(ay, bx));

            // w = - (n dot v0)
            XMVECTOR nw = XMVectorNegate(__DX_DOT3x3_PS(nx, ny, nz, x0, y0, z0));

            // Arrange back to per-triangle face-normal-major format
            __DX_TRANSPOSE4x4_PS(nx, ny, nz, nw);

            // Store results
            __DX_STORE_PS(&faceNormals[0].x, nx);
            __DX_STORE_PS(&faceNormals[1].x, ny);
            __DX_STORE_PS(&faceNormals[2].x, nz);
            __DX_STORE_PS(&faceNormals[3].x, nw);
            faceNormals += 4;
        }

        // Handle the remaining triangles
        for (size_t j = 0; j < numTriangles; ++j)
        {
            // Load vertices of the triangle
            XMVECTOR v0 = XMLoadFloat3((XMFLOAT3*)(positions + triangles->vertIndex[0] * 3));
            XMVECTOR v1 = XMLoadFloat3((XMFLOAT3*)(positions + triangles->vertIndex[1] * 3));
            XMVECTOR v2 = XMLoadFloat3((XMFLOAT3*)(positions + triangles->vertIndex[2] * 3));
            ++triangles;

            // Calculate face normal
            XMVECTOR plane = XMPlaneFromPoints(v0, v1, v2);

            // Store result
            __DX_STORE_PS(&faceNormals->x, plane);
            ++faceNormals;
        }
    }
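    // Illustrative scalar reference (a sketch for clarity only): each face
    // normal is stored as a plane equation (nx, ny, nz, w), where
    // n = (v1 - v0) x (v2 - v0) and w = -(n . v0), so points p on the
    // triangle's plane satisfy n . p + w = 0:
    //
    //     Vector3 a = v1 - v0, b = v2 - v0;
    //     Vector3 n = a.crossProduct(b);      // unnormalised face normal
    //     Real w = -n.dotProduct(v0);         // plane distance term
    //     *faceNormals = Vector4(n.x, n.y, n.z, w);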
    //---------------------------------------------------------------------
    void OptimisedUtilDirectXMath::calculateLightFacing(
        const Vector4& lightPos,
        const Vector4* faceNormals,
        char* lightFacings,
        size_t numFaces)
    {
        assert(_isAlignedForDirectXMath(faceNormals));

        // Map to convert a 4-bit mask to 4 byte values
        static const char msMaskMapping[16][4] =
        {
            {0, 0, 0, 0},   {1, 0, 0, 0},   {0, 1, 0, 0},   {1, 1, 0, 0},
            {0, 0, 1, 0},   {1, 0, 1, 0},   {0, 1, 1, 0},   {1, 1, 1, 0},
            {0, 0, 0, 1},   {1, 0, 0, 1},   {0, 1, 0, 1},   {1, 1, 0, 1},
            {0, 0, 1, 1},   {1, 0, 1, 1},   {0, 1, 1, 1},   {1, 1, 1, 1},
        };

        XMVECTOR n0, n1, n2, n3;
        XMVECTOR t0, t1;
        XMVECTOR dp;
        int bitmask;

        // Load light vector, unaligned
        XMVECTOR lp = XMLoadFloat4((XMFLOAT4*)(&lightPos.x));

        size_t numIterations = numFaces / 4;
        numFaces &= 3;

        // Four faces per iteration
        for (size_t i = 0; i < numIterations; ++i)
        {
            // Load face normals, aligned
            n0 = __DX_LOAD_PS(&faceNormals[0].x);
            n1 = __DX_LOAD_PS(&faceNormals[1].x);
            n2 = __DX_LOAD_PS(&faceNormals[2].x);
            n3 = __DX_LOAD_PS(&faceNormals[3].x);
            faceNormals += 4;

            // Multiply by light vector
            n0 = XMVectorMultiply(n0, lp);        // x0 y0 z0 w0
            n1 = XMVectorMultiply(n1, lp);        // x1 y1 z1 w1
            n2 = XMVectorMultiply(n2, lp);        // x2 y2 z2 w2
            n3 = XMVectorMultiply(n3, lp);        // x3 y3 z3 w3

            // Horizontally add the four vector values.
            t0 = XMVectorAdd(                                           // x0+z0 x1+z1 y0+w0 y1+w1
                XMVectorMergeXY(n0, n1),    // x0 x1 y0 y1
                XMVectorMergeZW(n0, n1));   // z0 z1 w0 w1
            t1 = XMVectorAdd(                                           // x2+z2 x3+z3 y2+w2 y3+w3
                XMVectorMergeXY(n2, n3),    // x2 x3 y2 y3
                XMVectorMergeZW(n2, n3));   // z2 z3 w2 w3
            dp = XMVectorAdd(                                           // dp0 dp1 dp2 dp3
                XMVectorPermute<0, 1, 4, 5>(t0, t1),    // x0+z0 x1+z1 x2+z2 x3+z3
                XMVectorPermute<6, 7, 2, 3>(t1, t0));   // y0+w0 y1+w1 y2+w2 y3+w3

            bitmask = XMVector4GreaterR(dp, g_XMZero);

            // Convert the 4-bit mask to 4 bytes, and store the results.
            *reinterpret_cast<uint32*>(lightFacings) =
                *reinterpret_cast<const uint32*>(msMaskMapping[bitmask]);
            lightFacings += 4;
        }

        // Handle the remaining faces
        switch (numFaces)
        {
        case 3:
            n0 = __DX_LOAD_PS(&faceNormals[0].x);
            n1 = __DX_LOAD_PS(&faceNormals[1].x);
            n2 = __DX_LOAD_PS(&faceNormals[2].x);

            n0 = XMVectorMultiply(n0, lp);        // x0 y0 z0 w0
            n1 = XMVectorMultiply(n1, lp);        // x1 y1 z1 w1
            n2 = XMVectorMultiply(n2, lp);        // x2 y2 z2 w2

            t0 = XMVectorAdd(                                            // x0+z0 x1+z1 y0+w0 y1+w1
                XMVectorMergeXY(n0, n1),    // x0 x1 y0 y1
                XMVectorMergeZW(n0, n1));   // z0 z1 w0 w1
            t1 = XMVectorAdd(                                            // x2+z2 x2+z2 y2+w2 y2+w2
                XMVectorMergeXY(n2, n2),    // x2 x2 y2 y2
                XMVectorMergeZW(n2, n2));   // z2 z2 w2 w2
            dp = XMVectorAdd(                                            // dp0 dp1 dp2 dp2
                XMVectorPermute<0, 1, 4, 5>(t0, t1),    // x0+z0 x1+z1 x2+z2 x2+z2
                XMVectorPermute<6, 7, 2, 3>(t1, t0));   // y0+w0 y1+w1 y2+w2 y2+w2

            bitmask = XMVector4GreaterR(dp, g_XMZero);

            lightFacings[0] = msMaskMapping[bitmask][0];
            lightFacings[1] = msMaskMapping[bitmask][1];
            lightFacings[2] = msMaskMapping[bitmask][2];
            break;

        case 2:
            n0 = __DX_LOAD_PS(&faceNormals[0].x);
            n1 = __DX_LOAD_PS(&faceNormals[1].x);

            n0 = XMVectorMultiply(n0, lp);        // x0 y0 z0 w0
            n1 = XMVectorMultiply(n1, lp);        // x1 y1 z1 w1

            t0 = XMVectorAdd(                                            // x0+z0 x1+z1 y0+w0 y1+w1
                XMVectorMergeXY(n0, n1),    // x0 x1 y0 y1
                XMVectorMergeZW(n0, n1));   // z0 z1 w0 w1
            dp = XMVectorAdd(                                            // dp0 dp1 dp0 dp1
                XMVectorSwizzle<0, 1, 0, 1>(t0),        // x0+z0 x1+z1 x0+z0 x1+z1
                XMVectorSwizzle<2, 3, 2, 3>(t0));   // y0+w0 y1+w1 y0+w0 y1+w1

            bitmask = XMVector4GreaterR(dp, g_XMZero);

            lightFacings[0] = msMaskMapping[bitmask][0];
            lightFacings[1] = msMaskMapping[bitmask][1];
            break;

        case 1:
            n0 = __DX_LOAD_PS(&faceNormals[0].x);

            n0 = XMVectorMultiply(n0, lp);        // x0 y0 z0 w0

            t0 = XMVectorAdd(                                            // x0+z0 x0+z0 y0+w0 y0+w0
                XMVectorMergeXY(n0, n0),    // x0 x0 y0 y0
                XMVectorMergeZW(n0, n0));   // z0 z0 w0 w0
            dp = XMVectorAdd(                                            // dp0 dp0 dp0 dp0
                XMVectorSplatX(t0),      // x0+z0 x0+z0 x0+z0 x0+z0
                XMVectorSplatZ(t0));     // y0+w0 y0+w0 y0+w0 y0+w0

            bitmask = XMVector4GreaterR(dp, g_XMZero);

            lightFacings[0] = msMaskMapping[bitmask][0];
            break;
        }
    }
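    // Illustrative scalar reference (a sketch for clarity only): a face is
    // treated as light-facing when the 4D dot product of its plane equation
    // with the homogeneous light position is positive; for a point light
    // (w = 1) that is n . L + w > 0, for a directional light (w = 0) it
    // reduces to n . L > 0:
    //
    //     for (size_t f = 0; f < numFaces; ++f)
    //         lightFacings[f] = (faceNormals[f].dotProduct(lightPos) > 0.0f) ? 1 : 0;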
    //---------------------------------------------------------------------
    // Template to extrude vertices for a directional light.
    template <bool srcAligned, bool destAligned>
    struct ExtrudeVertices_DirectXMath_DirectionalLight
    {
        static void apply(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* pSrcPos,
            float* pDestPos,
            size_t numVertices)
        {
            typedef DirectXMathMemoryAccessor<srcAligned> SrcAccessor;
            typedef DirectXMathMemoryAccessor<destAligned> DestAccessor;

            // Directional light, extrusion is along the light direction

            // Load light vector, unaligned
            XMVECTOR lp = XMLoadFloat4((XMFLOAT4*)(&lightPos.x));

            // Calculate the extrusion direction. Note that we use the inverted
            // direction here to eliminate an extra negate instruction; we
            // compensate for that by using subtract instructions later.
            XMVECTOR dir = XMVectorMultiply(      // X Y Z -
                XMVector3NormalizeEst(lp),
                XMVectorReplicate(extrudeDist));

            // Prepare the extrusion direction for extruding 4 vertices in parallel
            XMVECTOR dir0 = XMVectorSwizzle<0, 1, 2, 0>(dir);   // X Y Z X
            XMVECTOR dir1 = XMVectorSwizzle<1, 2, 0, 1>(dir);   // Y Z X Y
            XMVECTOR dir2 = XMVectorSwizzle<2, 0, 1, 2>(dir);   // Z X Y Z

            XMVECTOR s0, s1, s2;
            XMVECTOR d0, d1, d2;

            size_t numIterations = numVertices / 4;
            numVertices &= 3;

            // Extrude 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                s0 = SrcAccessor::load(pSrcPos + 0);
                s1 = SrcAccessor::load(pSrcPos + 4);
                s2 = SrcAccessor::load(pSrcPos + 8);
                pSrcPos += 12;

                // The extrusion direction is inverted, so use subtract instructions here
                d0 = XMVectorSubtract(s0, dir0);    // X0 Y0 Z0 X1
                d1 = XMVectorSubtract(s1, dir1);    // Y1 Z1 X2 Y2
                d2 = XMVectorSubtract(s2, dir2);    // Z2 X3 Y3 Z3

                DestAccessor::store(pDestPos + 0, d0);
                DestAccessor::store(pDestPos + 4, d1);
                DestAccessor::store(pDestPos + 8, d2);
                pDestPos += 12;
            }

            // Handle the remaining vertices
            switch (numVertices)
            {
            case 3:
                // 9 floating-point values
                s0 = SrcAccessor::load(pSrcPos + 0);
                s1 = SrcAccessor::load(pSrcPos + 4);
                s2 = XMLoadFloat(pSrcPos + 8);

                // The extrusion direction is inverted, so use subtract instructions here
                d0 = XMVectorSubtract(s0, dir0);    // X0 Y0 Z0 X1
                d1 = XMVectorSubtract(s1, dir1);    // Y1 Z1 X2 Y2
                d2 = XMVectorSubtract(s2, dir2);    // Z2 -- -- --

                DestAccessor::store(pDestPos + 0, d0);
                DestAccessor::store(pDestPos + 4, d1);
                XMStoreFloat(pDestPos + 8, d2);
                break;

            case 2:
                // 6 floating-point values
                s0 = SrcAccessor::load(pSrcPos + 0);
                s1 = XMLoadFloat2((XMFLOAT2*)(pSrcPos + 4));

                // The extrusion direction is inverted, so use subtract instructions here
                d0 = XMVectorSubtract(s0, dir0);    // X0 Y0 Z0 X1
                d1 = XMVectorSubtract(s1, dir1);    // Y1 Z1 -- --

                DestAccessor::store(pDestPos + 0, d0);
                XMStoreFloat2((XMFLOAT2*)(pDestPos + 4), d1);
                break;

            case 1:
                // 3 floating-point values
                s0 = XMLoadFloat3((XMFLOAT3*)(pSrcPos + 0));

                // The extrusion direction is inverted, so use a subtract instruction here
                d0 = XMVectorSubtract(s0, dir0);    // X0 Y0 Z0 --

                XMStoreFloat3((XMFLOAT3*)(pDestPos + 0), d0);
                break;
            }
        }
    };
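    // Illustrative scalar reference (a sketch for clarity only): for a
    // directional light every vertex is pushed the same distance along the
    // direction away from the light, i.e. opposite lightPos.xyz:
    //
    //     Vector3 dir = Vector3(lightPos.x, lightPos.y, lightPos.z).normalisedCopy() * extrudeDist;
    //     dest = src - dir;   // matches the inverted-direction subtract above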
    //---------------------------------------------------------------------
    // Template to extrude vertices for a point light.
    template <bool srcAligned, bool destAligned>
    struct ExtrudeVertices_DirectXMath_PointLight
    {
        static void apply(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* pSrcPos,
            float* pDestPos,
            size_t numVertices)
        {
            typedef DirectXMathMemoryAccessor<srcAligned> SrcAccessor;
            typedef DirectXMathMemoryAccessor<destAligned> DestAccessor;

            // Point light: the extrusion direction is calculated for every vertex

            // Load light vector, unaligned
            XMVECTOR lp = XMLoadFloat4((XMFLOAT4*)(&lightPos.x));

            // Load extrude distance
            XMVECTOR extrudeDist4 = XMVectorReplicate(extrudeDist);

            size_t numIterations = numVertices / 4;
            numVertices &= 3;

            // Extrude 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Load source positions
                XMVECTOR s0 = SrcAccessor::load(pSrcPos + 0);     // x0 y0 z0 x1
                XMVECTOR s1 = SrcAccessor::load(pSrcPos + 4);     // y1 z1 x2 y2
                XMVECTOR s2 = SrcAccessor::load(pSrcPos + 8);     // z2 x3 y3 z3
                pSrcPos += 12;

                // Arrange to 3x4 component-major format for batch calculation
                __DX_TRANSPOSE4x3_PS(s0, s1, s2);

                // Calculate unnormalised extrusion directions
                XMVECTOR dx = XMVectorSubtract(s0, XMVectorSplatX(lp));     // X0 X1 X2 X3
                XMVECTOR dy = XMVectorSubtract(s1, XMVectorSplatY(lp));     // Y0 Y1 Y2 Y3
                XMVECTOR dz = XMVectorSubtract(s2, XMVectorSplatZ(lp));     // Z0 Z1 Z2 Z3

                // Normalise the extrusion directions and multiply by the extrude distance
                XMVECTOR tmp = __DX_DOT3x3_PS(dx, dy, dz, dx, dy, dz);
                tmp = XMVectorMultiply(XMVectorReciprocalSqrtEst(tmp), extrudeDist4);
                dx = XMVectorMultiply(dx, tmp);
                dy = XMVectorMultiply(dy, tmp);
                dz = XMVectorMultiply(dz, tmp);

                // Calculate extruded positions
                XMVECTOR d0 = XMVectorAdd(dx, s0);
                XMVECTOR d1 = XMVectorAdd(dy, s1);
                XMVECTOR d2 = XMVectorAdd(dz, s2);

                // Arrange back to 4x3 continuous format to store the results
                __DX_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store extruded positions
                DestAccessor::store(pDestPos + 0, d0);
                DestAccessor::store(pDestPos + 4, d1);
                DestAccessor::store(pDestPos + 8, d2);
                pDestPos += 12;
            }

            // Handle the remaining vertices
            for (size_t j = 0; j < numVertices; ++j)
            {
                // Load source position
                XMVECTOR src = XMLoadFloat3((XMFLOAT3*)(pSrcPos + 0));  // x y z 0
                pSrcPos += 3;

                // Calculate unnormalised extrusion direction
                XMVECTOR dir = XMVectorSubtract(src, lp); // X Y Z 0

                // Normalise the extrusion direction and multiply by the extrude distance
                dir = XMVectorMultiply(
                    XMVector3NormalizeEst(dir),
                    extrudeDist4);

                // Calculate extruded position
                XMVECTOR dst = XMVectorAdd(dir, src);

                // Store extruded position
                XMStoreFloat3((XMFLOAT3*)(pDestPos + 0), dst);
                pDestPos += 3;
            }
        }
    };
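    // Illustrative scalar reference (a sketch for clarity only): for a point
    // light the extrusion direction differs per vertex, pointing from the
    // light through the vertex:
    //
    //     Vector3 dir = (src - Vector3(lightPos.x, lightPos.y, lightPos.z)).normalisedCopy();
    //     dest = src + dir * extrudeDist;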
    //---------------------------------------------------------------------
    void OptimisedUtilDirectXMath::extrudeVertices(
        const Vector4& lightPos,
        Real extrudeDist,
        const float* pSrcPos,
        float* pDestPos,
        size_t numVertices)
    {
        // Note: Since pDestPos follows the tail of pSrcPos, we can't assume it
        // is properly SIMD-aligned, so we must check for that here.
        //
        // TODO: Add an extra vertex to the vertex buffer to make sure pDestPos
        // gets the same alignment as pSrcPos.
        //

        // We use the DirectXMath estimated reciprocal square root directly while
        // calculating the extrusion direction, since the precision loss is not
        // that important here.
        //
        if (lightPos.w == 0.0f)
        {
            if (_isAlignedForDirectXMath(pSrcPos))
            {
                if (_isAlignedForDirectXMath(pDestPos))
                    ExtrudeVertices_DirectXMath_DirectionalLight<true, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_DirectXMath_DirectionalLight<true, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
            else
            {
                if (_isAlignedForDirectXMath(pDestPos))
                    ExtrudeVertices_DirectXMath_DirectionalLight<false, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_DirectXMath_DirectionalLight<false, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
        }
        else
        {
            assert(lightPos.w == 1.0f);

            if (_isAlignedForDirectXMath(pSrcPos))
            {
                if (_isAlignedForDirectXMath(pDestPos))
                    ExtrudeVertices_DirectXMath_PointLight<true, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_DirectXMath_PointLight<true, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
            else
            {
                if (_isAlignedForDirectXMath(pDestPos))
                    ExtrudeVertices_DirectXMath_PointLight<false, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_DirectXMath_PointLight<false, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
        }
    }
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    extern OptimisedUtil* _getOptimisedUtilDirectXMath(void)
    {
        static OptimisedUtilDirectXMath msOptimisedUtilDirectXMath;
        return &msOptimisedUtilDirectXMath;
    }

}

#endif // __OGRE_HAVE_DIRECTXMATH