1 /*
2 -----------------------------------------------------------------------------
3 This source file is part of OGRE
4 (Object-oriented Graphics Rendering Engine)
5 For the latest info, see http://www.ogre3d.org/
6
7 Copyright (c) 2000-2013 Torus Knot Software Ltd
8
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.
26 -----------------------------------------------------------------------------
27 */
28
29 #include "OgreStableHeaders.h"
30
31 #include "OgreOptimisedUtil.h"
32 #include "OgrePlatformInformation.h"
33
34 #if __OGRE_HAVE_DIRECTXMATH
35
36 #include "OgreVector3.h"
37 #include "OgreMatrix4.h"
38
39 #include <directxmath.h>
40 using namespace DirectX;
41
42 // Use unrolled version when vertices exceed this limit
43 #define OGRE_DIRECTXMATH_SKINNING_UNROLL_VERTICES 16
44
45 namespace Ogre {
46
47 //-------------------------------------------------------------------------
48 // Local classes
49 //-------------------------------------------------------------------------
50
51 /** DirectXMath implementation of OptimisedUtil.
52 @note
53 Don't use this class directly, use OptimisedUtil instead.
54 */
55 class _OgrePrivate OptimisedUtilDirectXMath : public OptimisedUtil
56 {
57 public:
58 /// @copydoc OptimisedUtil::softwareVertexSkinning
59 virtual void softwareVertexSkinning(
60 const float *srcPosPtr, float *destPosPtr,
61 const float *srcNormPtr, float *destNormPtr,
62 const float *blendWeightPtr, const unsigned char* blendIndexPtr,
63 const Matrix4* const* blendMatrices,
64 size_t srcPosStride, size_t destPosStride,
65 size_t srcNormStride, size_t destNormStride,
66 size_t blendWeightStride, size_t blendIndexStride,
67 size_t numWeightsPerVertex,
68 size_t numVertices);
69
70 /// @copydoc OptimisedUtil::softwareVertexMorph
71 virtual void softwareVertexMorph(
72 Real t,
73 const float *srcPos1, const float *srcPos2,
74 float *dstPos,
75 size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
76 size_t numVertices,
77 bool morphNormals);
78
79 /// @copydoc OptimisedUtil::concatenateAffineMatrices
80 virtual void concatenateAffineMatrices(
81 const Matrix4& baseMatrix,
82 const Matrix4* srcMatrices,
83 Matrix4* dstMatrices,
84 size_t numMatrices);
85
86 /// @copydoc OptimisedUtil::calculateFaceNormals
87 virtual void calculateFaceNormals(
88 const float *positions,
89 const EdgeData::Triangle *triangles,
90 Vector4 *faceNormals,
91 size_t numTriangles);
92
93 /// @copydoc OptimisedUtil::calculateLightFacing
94 virtual void calculateLightFacing(
95 const Vector4& lightPos,
96 const Vector4* faceNormals,
97 char* lightFacings,
98 size_t numFaces);
99
100 /// @copydoc OptimisedUtil::extrudeVertices
101 virtual void extrudeVertices(
102 const Vector4& lightPos,
103 Real extrudeDist,
104 const float* srcPositions,
105 float* destPositions,
106 size_t numVertices);
107 };
108
109 //---------------------------------------------------------------------
110 // DirectXMath helpers.
111 //---------------------------------------------------------------------
112
113 /** Check whether or not the given pointer is perfectly aligned for DirectXMath.
114 */
115 static FORCEINLINE bool _isAlignedForDirectXMath(const void *p)
116 {
117 return (((size_t)p) & 15) == 0;
118 }
119
120 /// Linear interpolation
121 #define __DX_LERP_PS(t, a, b) \
122 XMVectorLerpV(a, b, t)
123
124 /// Linear interpolation. A single-value lerp is not supported in DirectXMath, so fall back to __DX_LERP_PS.
125 #define __DX_LERP_SS(t, a, b) \
126 __DX_LERP_PS(t, a, b)
127
128 #define __DX_LOAD_PS(p) \
129 (*(XMVECTOR*)(p))
130
131 #define __DX_STORE_PS(p, v) \
132 (*(XMVECTOR*)(p) = (v))
133
134 /// Accumulate three vectors of single-precision floating-point values.
135 #define __DX_ACCUM3_PS(a, b, c) \
136 XMVectorAdd(XMVectorAdd(a, b), c)
137
138 /// Accumulate four vectors of single-precision floating-point values.
139 #define __DX_ACCUM4_PS(a, b, c, d) \
140 XMVectorAdd(XMVectorAdd(a, b), XMVectorAdd(c, d))
141
142 /** Perform a dot-product between two sets of three vectors of single-precision
143 floating-point values.
144 */
145 #define __DX_DOT3x3_PS(r0, r1, r2, v0, v1, v2) \
146 __DX_ACCUM3_PS(XMVectorMultiply(r0, v0), XMVectorMultiply(r1, v1), XMVectorMultiply(r2, v2))
147
148 /** Perform a dot-product between a set of four vectors and a set of three
149 vectors of single-precision floating-point values.
150 */
151 #define __DX_DOT4x3_PS(r0, r1, r2, r3, v0, v1, v2) \
152 __DX_ACCUM4_PS(XMVectorMultiply(r0, v0), XMVectorMultiply(r1, v1), XMVectorMultiply(r2, v2), r3)
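
// Worked example (illustrative note, not part of the build): per SIMD lane i,
// the two dot-product macros above evaluate to
//
//   __DX_DOT3x3_PS: r0[i]*v0[i] + r1[i]*v1[i] + r2[i]*v2[i]
//   __DX_DOT4x3_PS: r0[i]*v0[i] + r1[i]*v1[i] + r2[i]*v2[i] + r3[i]
//
// i.e. __DX_DOT4x3_PS treats the missing fourth source component as an implicit
// 1.0, which is why it is used for positions below while __DX_DOT3x3_PS is used
// for normals/directions.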
153
154 /** Perform the transpose of a 4x4 matrix of single-precision floating-point
155 values.
156 Arguments r0, r1, r2, and r3 are XMVECTOR values whose elements
157 form the corresponding rows of a 4x4 matrix.
158 The matrix transpose is returned in arguments r0, r1, r2, and
159 r3 where r0 now holds column 0 of the original matrix, r1 now
160 holds column 1 of the original matrix, etc.
161 */
162 #define __DX_TRANSPOSE4x4_PS(r0, r1, r2, r3) \
163 { \
164 XMVECTOR tmp3, tmp2, tmp1, tmp0; \
165 \
166 /* r00 r01 r02 r03 */ \
167 /* r10 r11 r12 r13 */ \
168 /* r20 r21 r22 r23 */ \
169 /* r30 r31 r32 r33 */ \
170 \
171 tmp0 = XMVectorMergeXY(r0, r1); /* r00 r10 r01 r11 */ \
172 tmp2 = XMVectorMergeZW(r0, r1); /* r02 r12 r03 r13 */ \
173 tmp1 = XMVectorMergeXY(r2, r3); /* r20 r30 r21 r31 */ \
174 tmp3 = XMVectorMergeZW(r2, r3); /* r22 r32 r23 r33 */ \
175 \
176 r0 = XMVectorPermute<0, 1, 4, 5>(tmp0, tmp1); /* r00 r10 r20 r30 */ \
177 r1 = XMVectorPermute<6, 7, 2, 3>(tmp1, tmp0); /* r01 r11 r21 r31 */ \
178 r2 = XMVectorPermute<0, 1, 4, 5>(tmp2, tmp3); /* r02 r12 r22 r32 */ \
179 r3 = XMVectorPermute<6, 7, 2, 3>(tmp3, tmp2); /* r03 r13 r23 r33 */ \
180 }
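
// Usage sketch (illustrative only; the literal values are made up): after the
// macro runs, the same four variables hold the columns of the original matrix.
//
//   XMVECTOR r0 = XMVectorSet( 1.0f,  2.0f,  3.0f,  4.0f);
//   XMVECTOR r1 = XMVectorSet( 5.0f,  6.0f,  7.0f,  8.0f);
//   XMVECTOR r2 = XMVectorSet( 9.0f, 10.0f, 11.0f, 12.0f);
//   XMVECTOR r3 = XMVectorSet(13.0f, 14.0f, 15.0f, 16.0f);
//   __DX_TRANSPOSE4x4_PS(r0, r1, r2, r3);
//   // r0 == (1, 5, 9, 13), r1 == (2, 6, 10, 14), and so on.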
181
182 /** Perform the transpose of the contiguously stored rows of a 4x3 matrix into
183 a 3x4 matrix of single-precision floating-point values.
184 Arguments v0, v1, and v2 are XMVECTOR values whose elements form the
185 corresponding contiguously stored rows of a 4x3 matrix.
186 The matrix transpose is returned in arguments v0, v1, and v2, where
187 v0 now holds column 0 of the original matrix, v1 now holds column 1
188 of the original matrix, etc.
189 */
190 #define __DX_TRANSPOSE4x3_PS(v0, v1, v2) \
191 { \
192 XMVECTOR tmp0, tmp1, tmp2; \
193 \
194 /* r00 r01 r02 r10 */ \
195 /* r11 r12 r20 r21 */ \
196 /* r22 r30 r31 r32 */ \
197 \
198 tmp0 = XMVectorPermute<0, 3, 4, 7>(v0, v2); /* r00 r10 r22 r32 */ \
199 tmp1 = XMVectorPermute<1, 2, 4, 5>(v0, v1); /* r01 r02 r11 r12 */ \
200 tmp2 = XMVectorPermute<2, 3, 5, 6>(v1, v2); /* r20 r21 r30 r31 */ \
201 \
202 v0 = XMVectorPermute<0, 1, 4, 6>(tmp0, tmp2); /* r00 r10 r20 r30 */ \
203 v1 = XMVectorPermute<0, 2, 5, 7>(tmp1, tmp2); /* r01 r11 r21 r31 */ \
204 v2 = XMVectorPermute<1, 3, 6, 7>(tmp1, tmp0); /* r02 r12 r22 r32 */ \
205 }
206
207 /** Perform the transpose of a 3x4 matrix into the contiguously stored rows of
208 a 4x3 matrix of single-precision floating-point values.
209 Arguments v0, v1, and v2 are XMVECTOR values whose elements form the
210 corresponding columns of a 3x4 matrix.
211 The matrix transpose is returned in arguments v0, v1, and v2, as the
212 contiguously stored rows of a 4x3 matrix.
213 */
214 #define __DX_TRANSPOSE3x4_PS(v0, v1, v2) \
215 { \
216 XMVECTOR tmp0, tmp1, tmp2; \
217 \
218 /* r00 r10 r20 r30 */ \
219 /* r01 r11 r21 r31 */ \
220 /* r02 r12 r22 r32 */ \
221 \
222 tmp0 = XMVectorPermute<1, 3, 4, 6>(v0, v2); /* r10 r30 r02 r22 */ \
223 tmp1 = XMVectorPermute<1, 3, 5, 7>(v1, v2); /* r11 r31 r12 r32 */ \
224 tmp2 = XMVectorPermute<0, 2, 4, 6>(v0, v1); /* r00 r20 r01 r21 */ \
225 \
226 v0 = XMVectorPermute<0, 2, 6, 4>(tmp2, tmp0); /* r00 r01 r02 r10 */ \
227 v1 = XMVectorPermute<0, 2, 5, 7>(tmp1, tmp2); /* r11 r12 r20 r21 */ \
228 v2 = XMVectorPermute<3, 1, 5, 7>(tmp0, tmp1); /* r22 r30 r31 r32 */ \
229 }
230
231 /** Helper to load/store DirectXMath data based on whether or not it is aligned.
232 */
233 template <bool aligned = false>
234 struct DirectXMathMemoryAccessor
235 {
236 static FORCEINLINE XMVECTOR load(const float *p)
237 {
238 return XMLoadFloat4(reinterpret_cast<const XMFLOAT4*>(p));
239 }
240 static FORCEINLINE void store(float *p, const XMVECTOR& v)
241 {
242 XMStoreFloat4(reinterpret_cast<XMFLOAT4*>(p), v);
243 }
244 };
245 // Special aligned accessor
246 template <>
247 struct DirectXMathMemoryAccessor<true>
248 {
249 static FORCEINLINE const XMVECTOR load(const float *p)
250 {
251 return __DX_LOAD_PS(p);
252 }
253 static FORCEINLINE void store(float *p, const XMVECTOR& v)
254 {
255 __DX_STORE_PS(p, v);
256 }
257 };
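
// Usage sketch (illustrative only; pFloats stands for any suitably allocated
// float buffer): the boolean template argument selects the accessor at compile
// time, so the aligned path compiles down to a direct 16-byte vector move while
// the unaligned path goes through XMLoadFloat4/XMStoreFloat4.
//
//   typedef DirectXMathMemoryAccessor<true>  AlignedAccessor;
//   typedef DirectXMathMemoryAccessor<false> UnalignedAccessor;
//   XMVECTOR v = AlignedAccessor::load(pFloats);   // requires 16-byte alignment
//   UnalignedAccessor::store(pFloats, v);          // works for any alignment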
258
259 //---------------------------------------------------------------------
260 // Some useful macros for collapsing matrices.
261 //---------------------------------------------------------------------
262
263 #define __LOAD_MATRIX(row0, row1, row2, pMatrix) \
264 { \
265 row0 = __DX_LOAD_PS((*pMatrix)[0]); \
266 row1 = __DX_LOAD_PS((*pMatrix)[1]); \
267 row2 = __DX_LOAD_PS((*pMatrix)[2]); \
268 }
269
270 #define __LERP_MATRIX(row0, row1, row2, weight, pMatrix) \
271 { \
272 row0 = XMVectorLerpV(row0, __DX_LOAD_PS((*pMatrix)[0]), weight);\
273 row1 = XMVectorLerpV(row1, __DX_LOAD_PS((*pMatrix)[1]), weight);\
274 row2 = XMVectorLerpV(row2, __DX_LOAD_PS((*pMatrix)[2]), weight);\
275 }
276
277 #define __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix) \
278 { \
279 row0 = XMVectorMultiply(__DX_LOAD_PS((*pMatrix)[0]), weight); \
280 row1 = XMVectorMultiply(__DX_LOAD_PS((*pMatrix)[1]), weight); \
281 row2 = XMVectorMultiply(__DX_LOAD_PS((*pMatrix)[2]), weight); \
282 }
283
284 #define __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix) \
285 { \
286 row0 = XMVectorMultiplyAdd(__DX_LOAD_PS((*pMatrix)[0]), weight, row0); \
287 row1 = XMVectorMultiplyAdd(__DX_LOAD_PS((*pMatrix)[1]), weight, row1); \
288 row2 = XMVectorMultiplyAdd(__DX_LOAD_PS((*pMatrix)[2]), weight, row2); \
289 }
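
// Illustrative note (not part of the build): taken together, the macros above
// accumulate the top three rows of the "collapsed" blend matrix
//
//   C = w0*M0 + w1*M1 + ... + w(n-1)*M(n-1)
//
// one weighted matrix at a time. The fourth row is never loaded because the
// blend matrices are affine, so their bottom row is always (0, 0, 0, 1).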
290
291 //---------------------------------------------------------------------
292 // The following macros require variables declared by the caller.
293 //
294 // :) Thanks to the row-major matrices used in Ogre, accessing affine matrices is easy.
295 //---------------------------------------------------------------------
296
297 /** Collapse a one-weight matrix.
298 The multiply by the weight is eliminated, since the weight is always equal to one.
299 */
300 #define __COLLAPSE_MATRIX_W1(row0, row1, row2, ppMatrices, pIndices, pWeights) \
301 { \
302 pMatrix0 = ppMatrices[pIndices[0]]; \
303 __LOAD_MATRIX(row0, row1, row2, pMatrix0); \
304 }
305
306 /** Collapse a two-weight matrix.
307 Because the accumulated weights are equal to one, using a lerp replaces
308 two multiplies and one add with one multiply and two adds.
309 */
310 #define __COLLAPSE_MATRIX_W2(row0, row1, row2, ppMatrices, pIndices, pWeights) \
311 { \
312 weight = XMVectorReplicatePtr(pWeights + 1); \
313 pMatrix0 = ppMatrices[pIndices[0]]; \
314 __LOAD_MATRIX(row0, row1, row2, pMatrix0); \
315 pMatrix1 = ppMatrices[pIndices[1]]; \
316 __LERP_MATRIX(row0, row1, row2, weight, pMatrix1); \
317 }
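
// Illustrative derivation (not part of the build): with two weights summing to
// one (w0 + w1 == 1), the weighted sum of two rows collapses to a single lerp,
// which is exactly what __LERP_MATRIX exploits:
//
//   w0*A + w1*B == (1 - w1)*A + w1*B == A + w1*(B - A) == XMVectorLerpV(A, B, w1)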
318
319 /** Collapse a three-weight matrix.
320 */
321 #define __COLLAPSE_MATRIX_W3(row0, row1, row2, ppMatrices, pIndices, pWeights) \
322 { \
323 weight = XMVectorReplicatePtr(pWeights + 0); \
324 pMatrix0 = ppMatrices[pIndices[0]]; \
325 __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0); \
326 weight = XMVectorReplicatePtr(pWeights + 1); \
327 pMatrix1 = ppMatrices[pIndices[1]]; \
328 __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1); \
329 weight = XMVectorReplicatePtr(pWeights + 2); \
330 pMatrix2 = ppMatrices[pIndices[2]]; \
331 __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2); \
332 }
333
334 /** Collapse a four-weight matrix.
335 */
336 #define __COLLAPSE_MATRIX_W4(row0, row1, row2, ppMatrices, pIndices, pWeights) \
337 { \
338 /* Load four blend weights at one time, they will be shuffled later */ \
339 weights = __DX_LOAD_PS(pWeights); \
340 \
341 pMatrix0 = ppMatrices[pIndices[0]]; \
342 weight = XMVectorSplatX(weights); \
343 __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0); \
344 pMatrix1 = ppMatrices[pIndices[1]]; \
345 weight = XMVectorSplatY(weights); \
346 __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1); \
347 pMatrix2 = ppMatrices[pIndices[2]]; \
348 weight = XMVectorSplatZ(weights); \
349 __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2); \
350 pMatrix3 = ppMatrices[pIndices[3]]; \
351 weight = XMVectorSplatW(weights); \
352 __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix3); \
353 }
354
355
356
357 //---------------------------------------------------------------------
358 // Collapse one matrix at a time. The collapsed matrix is weighted by the
359 // blend-weights, and can then be used to transform the corresponding vertex directly.
360 //
361 // I'd like to use an inline function instead of a macro here, but I also want
362 // to ensure the compiler integrates this code into its callers (in release
363 // builds at least), regardless of the specific compile options. An inline
364 // function works fine for VC, but gcc (3.4.4 here) appears to generate a
365 // function call when this is implemented as an inline function, even with "-O3".
366 //
367 #define _collapseOneMatrix( \
368 m00, m01, m02, \
369 pBlendWeight, pBlendIndex, \
370 blendMatrices, \
371 blendWeightStride, blendIndexStride, \
372 numWeightsPerVertex) \
373 { \
374 /* Important Note: If the pMatrixXXX variables are reused frequently, */ \
375 /* MS VC7.1 will generate wrong code here!!! */ \
376 const Matrix4* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3; \
377 XMVECTOR weight, weights; \
378 \
379 switch (numWeightsPerVertex) \
380 { \
381 default: /* Just in case and make compiler happy */ \
382 case 1: \
383 __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices, \
384 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
385 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
386 break; \
387 \
388 case 2: \
389 __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices, \
390 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
391 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
392 break; \
393 \
394 case 3: \
395 __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices, \
396 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
397 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
398 break; \
399 \
400 case 4: \
401 __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices, \
402 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
403 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
404 break; \
405 } \
406 }
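
// Usage sketch (mirrors the call site in softwareVertexSkinning_DirectXMath_General
// below): the caller declares the three output row vectors and passes the raw
// per-vertex blend weight/index pointers plus their strides.
//
//   XMVECTOR m00, m01, m02;
//   _collapseOneMatrix(
//       m00, m01, m02,
//       pBlendWeight, pBlendIndex,
//       blendMatrices,
//       blendWeightStride, blendIndexStride,
//       numWeightsPerVertex);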
407
408 //---------------------------------------------------------------------
409 // Collapse four matrices at one time. The collapsed matrices are weighted by the
410 // blend-weights, and can then be used to transform the corresponding vertices directly.
411 //
412 // I'd like to use an inline function instead of a macro here, but I also want
413 // to ensure the compiler integrates this code into its callers (in release
414 // builds at least), regardless of the specific compile options. An inline
415 // function works fine for VC, but gcc (3.4.4 here) appears to generate a
416 // function call when this is implemented as an inline function, even with "-O3".
417 //
418 #define _collapseFourMatrices( \
419 m00, m01, m02, \
420 m10, m11, m12, \
421 m20, m21, m22, \
422 m30, m31, m32, \
423 pBlendWeight, pBlendIndex, \
424 blendMatrices, \
425 blendWeightStride, blendIndexStride, \
426 numWeightsPerVertex) \
427 { \
428 /* Important Note: If the pMatrixXXX variables are reused frequently, */ \
429 /* MS VC7.1 will generate wrong code here!!! */ \
430 const Matrix4* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3; \
431 XMVECTOR weight, weights; \
432 \
433 switch (numWeightsPerVertex) \
434 { \
435 default: /* Just in case and make compiler happy */ \
436 case 1: \
437 __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices, \
438 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
439 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
440 __COLLAPSE_MATRIX_W1(m10, m11, m12, blendMatrices, \
441 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
442 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
443 __COLLAPSE_MATRIX_W1(m20, m21, m22, blendMatrices, \
444 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
445 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
446 __COLLAPSE_MATRIX_W1(m30, m31, m32, blendMatrices, \
447 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
448 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
449 break; \
450 \
451 case 2: \
452 __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices, \
453 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
454 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
455 __COLLAPSE_MATRIX_W2(m10, m11, m12, blendMatrices, \
456 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
457 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
458 __COLLAPSE_MATRIX_W2(m20, m21, m22, blendMatrices, \
459 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
460 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
461 __COLLAPSE_MATRIX_W2(m30, m31, m32, blendMatrices, \
462 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
463 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
464 break; \
465 \
466 case 3: \
467 __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices, \
468 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
469 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
470 __COLLAPSE_MATRIX_W3(m10, m11, m12, blendMatrices, \
471 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
472 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
473 __COLLAPSE_MATRIX_W3(m20, m21, m22, blendMatrices, \
474 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
475 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
476 __COLLAPSE_MATRIX_W3(m30, m31, m32, blendMatrices, \
477 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
478 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
479 break; \
480 \
481 case 4: \
482 __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices, \
483 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
484 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
485 __COLLAPSE_MATRIX_W4(m10, m11, m12, blendMatrices, \
486 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
487 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
488 __COLLAPSE_MATRIX_W4(m20, m21, m22, blendMatrices, \
489 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
490 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
491 __COLLAPSE_MATRIX_W4(m30, m31, m32, blendMatrices, \
492 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
493 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
494 break; \
495 } \
496 }
497
498
499
500 //---------------------------------------------------------------------
501 // General DirectXMath version: skins positions, and optionally skins normals.
502 static void softwareVertexSkinning_DirectXMath_General(
503 const float *pSrcPos, float *pDestPos,
504 const float *pSrcNorm, float *pDestNorm,
505 const float *pBlendWeight, const unsigned char* pBlendIndex,
506 const Matrix4* const* blendMatrices,
507 size_t srcPosStride, size_t destPosStride,
508 size_t srcNormStride, size_t destNormStride,
509 size_t blendWeightStride, size_t blendIndexStride,
510 size_t numWeightsPerVertex,
511 size_t numVertices)
512 {
513 for (size_t i = 0; i < numVertices; ++i)
514 {
515 // Collapse matrices
516 XMVECTOR m00, m01, m02;
517 _collapseOneMatrix(
518 m00, m01, m02,
519 pBlendWeight, pBlendIndex,
520 blendMatrices,
521 blendWeightStride, blendIndexStride,
522 numWeightsPerVertex);
523
524 // Advance blend weight and index pointers
525 advanceRawPointer(pBlendWeight, blendWeightStride);
526 advanceRawPointer(pBlendIndex, blendIndexStride);
527
528 //------------------------------------------------------------------
529
530 XMVECTOR m03 = g_XMZero;
531 __DX_TRANSPOSE4x4_PS(m00, m01, m02, m03);
532
533 //------------------------------------------------------------------
534 // Transform position
535 //------------------------------------------------------------------
536
537 XMVECTOR s0, s1, s2;
538
539 // Load source position
540 s0 = XMVectorReplicatePtr(pSrcPos + 0);
541 s1 = XMVectorReplicatePtr(pSrcPos + 1);
542 s2 = XMVectorReplicatePtr(pSrcPos + 2);
543
544 // Transform by collapsed matrix
545 XMVECTOR accumPos = __DX_DOT4x3_PS(m00, m01, m02, m03, s0, s1, s2); // x y z 0
546
547 // Store blended position, no alignment requirement
548 XMStoreFloat3((XMFLOAT3*)(pDestPos + 0), accumPos);
549
550 // Advance source and target position pointers
551 advanceRawPointer(pSrcPos, srcPosStride);
552 advanceRawPointer(pDestPos, destPosStride);
553
554 //------------------------------------------------------------------
555 // Optional blend normal
556 //------------------------------------------------------------------
557
558 if (pSrcNorm)
559 {
560 // Load source normal
561 s0 = XMVectorReplicatePtr(pSrcNorm + 0);
562 s1 = XMVectorReplicatePtr(pSrcNorm + 1);
563 s2 = XMVectorReplicatePtr(pSrcNorm + 2);
564
565 // Transform by collapsed matrix
566 XMVECTOR accumNorm = __DX_DOT3x3_PS(m00, m01, m02, s0, s1, s2); // x y z 0
567
568 // Normalise normal
569 accumNorm = XMVector3Normalize(accumNorm);
570
571 // Store blended normal, no alignment requirement
572 XMStoreFloat3((XMFLOAT3*)(pDestNorm + 0), accumNorm);
573
574 // Advance source and target normal pointers
575 advanceRawPointer(pSrcNorm, srcNormStride);
576 advanceRawPointer(pDestNorm, destNormStride);
577 }
578 }
579 }
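
// Reference sketch (illustrative only): ignoring the SIMD details, each vertex
// above is transformed by the weighted sum of its bone matrices,
//
//   outPos  = (w0*M0 + w1*M1 + ...) * (srcPos, 1)
//   outNorm = normalise(upper3x3(w0*M0 + w1*M1 + ...) * srcNorm)
//
// which is exactly what the collapsed rows m00/m01/m02 (plus the implicit
// affine bottom row) encode.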
580 //---------------------------------------------------------------------
581 // Special DirectXMath version for skinning shared position/normal buffers,
582 // where the shared buffer is packed.
583 template <bool srcAligned, bool destAligned>
584 struct SoftwareVertexSkinning_DirectXMath_PosNorm_Shared_Packed
585 {
586 static void apply(
587 const float* pSrc, float* pDest,
588 const float* pBlendWeight, const unsigned char* pBlendIndex,
589 const Matrix4* const* blendMatrices,
590 size_t blendWeightStride, size_t blendIndexStride,
591 size_t numWeightsPerVertex,
592 size_t numIterations)
593 {
594 typedef DirectXMathMemoryAccessor<srcAligned> SrcAccessor;
595 typedef DirectXMathMemoryAccessor<destAligned> DestAccessor;
596
597 // Blending 4 vertices per-iteration
598 for (size_t i = 0; i < numIterations; ++i)
599 {
600 // Collapse matrices
601 XMVECTOR m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
602 _collapseFourMatrices(
603 m00, m01, m02,
604 m10, m11, m12,
605 m20, m21, m22,
606 m30, m31, m32,
607 pBlendWeight, pBlendIndex,
608 blendMatrices,
609 blendWeightStride, blendIndexStride,
610 numWeightsPerVertex);
611
612 // Advance 4 vertices
613 advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
614 advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
615
616 //------------------------------------------------------------------
617 // Transform position/normals
618 //------------------------------------------------------------------
619
620 XMVECTOR s0, s1, s2, s3, s4, s5, d0, d1, d2, d3, d4, d5;
621 XMVECTOR t0, t1, t2, t3, t4, t5;
622
623 // Load source position/normals
624 s0 = SrcAccessor::load(pSrc + 0); // px0 py0 pz0 nx0
625 s1 = SrcAccessor::load(pSrc + 4); // ny0 nz0 px1 py1
626 s2 = SrcAccessor::load(pSrc + 8); // pz1 nx1 ny1 nz1
627 s3 = SrcAccessor::load(pSrc + 12); // px2 py2 pz2 nx2
628 s4 = SrcAccessor::load(pSrc + 16); // ny2 nz2 px3 py3
629 s5 = SrcAccessor::load(pSrc + 20); // pz3 nx3 ny3 nz3
630
631 // Rearrange to component-major format for batch calculation.
632 t0 = XMVectorMergeXY(s0, s3); // px0 px2 py0 py2
633 t1 = XMVectorMergeZW(s0, s3); // pz0 pz2 nx0 nx2
634 t2 = XMVectorMergeXY(s1, s4); // ny0 ny2 nz0 nz2
635 t3 = XMVectorMergeZW(s1, s4); // px1 px3 py1 py3
636 t4 = XMVectorMergeXY(s2, s5); // pz1 pz3 nx1 nx3
637 t5 = XMVectorMergeZW(s2, s5); // ny1 ny3 nz1 nz3
638
639 s0 = XMVectorMergeXY(t0, t3); // px0 px1 px2 px3
640 s1 = XMVectorMergeZW(t0, t3); // py0 py1 py2 py3
641 s2 = XMVectorMergeXY(t1, t4); // pz0 pz1 pz2 pz3
642 s3 = XMVectorMergeZW(t1, t4); // nx0 nx1 nx2 nx3
643 s4 = XMVectorMergeXY(t2, t5); // ny0 ny1 ny2 ny3
644 s5 = XMVectorMergeZW(t2, t5); // nz0 nz1 nz2 nz3
645
646 // Transform by collapsed matrix
647
648 // Shuffle row 0 of the four collapsed matrices to calculate the X components
649 __DX_TRANSPOSE4x4_PS(m00, m10, m20, m30);
650
651 // Transform X components
652 d0 = __DX_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // PX0 PX1 PX2 PX3
653 d3 = __DX_DOT3x3_PS(m00, m10, m20, s3, s4, s5); // NX0 NX1 NX2 NX3
654
655 // Shuffle row 1 of the four collapsed matrices to calculate the Y components
656 __DX_TRANSPOSE4x4_PS(m01, m11, m21, m31);
657
658 // Transform Y components
659 d1 = __DX_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // PY0 PY1 PY2 PY3
660 d4 = __DX_DOT3x3_PS(m01, m11, m21, s3, s4, s5); // NY0 NY1 NY2 NY3
661
662 // Shuffle row 2 of the four collapsed matrices to calculate the Z components
663 __DX_TRANSPOSE4x4_PS(m02, m12, m22, m32);
664
665 // Transform Z components
666 d2 = __DX_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // PZ0 PZ1 PZ2 PZ3
667 d5 = __DX_DOT3x3_PS(m02, m12, m22, s3, s4, s5); // NZ0 NZ1 NZ2 NZ3
668
669 // Normalise normals
670 XMVECTOR tmp = __DX_DOT3x3_PS(d3, d4, d5, d3, d4, d5);
671 tmp = XMVectorReciprocalSqrtEst(tmp);
672 d3 = XMVectorMultiply(d3, tmp);
673 d4 = XMVectorMultiply(d4, tmp);
674 d5 = XMVectorMultiply(d5, tmp);
675
676 // Arrange back to continuous format to store the results
677
678 t0 = XMVectorMergeXY(d0, d1); // PX0 PY0 PX1 PY1
679 t1 = XMVectorMergeZW(d0, d1); // PX2 PY2 PX3 PY3
680 t2 = XMVectorMergeXY(d2, d3); // PZ0 NX0 PZ1 NX1
681 t3 = XMVectorMergeZW(d2, d3); // PZ2 NX2 PZ3 NX3
682 t4 = XMVectorMergeXY(d4, d5); // NY0 NZ0 NY1 NZ1
683 t5 = XMVectorMergeZW(d4, d5); // NY2 NZ2 NY3 NZ3
684
685 d0 = XMVectorPermute<0, 1, 4, 5>(t0, t2); // PX0 PY0 PZ0 NX0
686 d1 = XMVectorPermute<0, 1, 6, 7>(t4, t0); // NY0 NZ0 PX1 PY1
687 d2 = XMVectorPermute<6, 7, 2, 3>(t4, t2); // PZ1 NX1 NY1 NZ1
688 d3 = XMVectorPermute<0, 1, 4, 5>(t1, t3); // PX2 PY2 PZ2 NX2
689 d4 = XMVectorPermute<0, 1, 6, 7>(t5, t1); // NY2 NZ2 PX3 PY3
690 d5 = XMVectorPermute<6, 7, 2, 3>(t5, t3); // PZ3 NX3 NY3 NZ3
691
692 // Store blended position/normals
693 DestAccessor::store(pDest + 0, d0);
694 DestAccessor::store(pDest + 4, d1);
695 DestAccessor::store(pDest + 8, d2);
696 DestAccessor::store(pDest + 12, d3);
697 DestAccessor::store(pDest + 16, d4);
698 DestAccessor::store(pDest + 20, d5);
699
700 // Advance 4 vertices
701 pSrc += 4 * (3 + 3);
702 pDest += 4 * (3 + 3);
703 }
704 }
705 };
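
// Layout note (illustrative only): the "shared packed" case assumes every vertex
// stores six consecutive floats, px py pz nx ny nz, so four vertices occupy
// 24 floats (96 bytes) and each iteration above consumes them as six XMVECTOR
// loads and produces six XMVECTOR stores.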
706 static FORCEINLINE void softwareVertexSkinning_DirectXMath_PosNorm_Shared_Packed(
707 const float* pSrcPos, float* pDestPos,
708 const float* pBlendWeight, const unsigned char* pBlendIndex,
709 const Matrix4* const* blendMatrices,
710 size_t blendWeightStride, size_t blendIndexStride,
711 size_t numWeightsPerVertex,
712 size_t numIterations)
713 {
714 // pSrcPos might not be 16-byte aligned, because of the 8-byte alignment shift per vertex
715
716 // Instantiate two versions only, since other alignment combinations are not that important.
717 if (_isAlignedForDirectXMath(pSrcPos) && _isAlignedForDirectXMath(pDestPos))
718 {
719 SoftwareVertexSkinning_DirectXMath_PosNorm_Shared_Packed<true, true>::apply(
720 pSrcPos, pDestPos,
721 pBlendWeight, pBlendIndex,
722 blendMatrices,
723 blendWeightStride, blendIndexStride,
724 numWeightsPerVertex,
725 numIterations);
726 }
727 else
728 {
729 SoftwareVertexSkinning_DirectXMath_PosNorm_Shared_Packed<false, false>::apply(
730 pSrcPos, pDestPos,
731 pBlendWeight, pBlendIndex,
732 blendMatrices,
733 blendWeightStride, blendIndexStride,
734 numWeightsPerVertex,
735 numIterations);
736 }
737 }
738 //---------------------------------------------------------------------
739 // Special DirectXMath version for skinning separate position and normal buffers,
740 // where both the position and normal buffers are packed.
741 template <bool srcPosAligned, bool destPosAligned, bool srcNormAligned, bool destNormAligned>
742 struct SoftwareVertexSkinning_DirectXMath_PosNorm_Separated_Packed
743 {
744 static void apply(
745 const float* pSrcPos, float* pDestPos,
746 const float* pSrcNorm, float* pDestNorm,
747 const float* pBlendWeight, const unsigned char* pBlendIndex,
748 const Matrix4* const* blendMatrices,
749 size_t blendWeightStride, size_t blendIndexStride,
750 size_t numWeightsPerVertex,
751 size_t numIterations)
752 {
753 typedef DirectXMathMemoryAccessor<srcPosAligned> SrcPosAccessor;
754 typedef DirectXMathMemoryAccessor<destPosAligned> DestPosAccessor;
755 typedef DirectXMathMemoryAccessor<srcNormAligned> SrcNormAccessor;
756 typedef DirectXMathMemoryAccessor<destNormAligned> DestNormAccessor;
757
758 // Blending 4 vertices per-iteration
759 for (size_t i = 0; i < numIterations; ++i)
760 {
761 // Collapse matrices
762 XMVECTOR m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
763 _collapseFourMatrices(
764 m00, m01, m02,
765 m10, m11, m12,
766 m20, m21, m22,
767 m30, m31, m32,
768 pBlendWeight, pBlendIndex,
769 blendMatrices,
770 blendWeightStride, blendIndexStride,
771 numWeightsPerVertex);
772
773 // Advance 4 vertices
774 advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
775 advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
776
777 //------------------------------------------------------------------
778 // Transform positions
779 //------------------------------------------------------------------
780
781 XMVECTOR s0, s1, s2, d0, d1, d2;
782
783 // Load source positions
784 s0 = SrcPosAccessor::load(pSrcPos + 0); // x0 y0 z0 x1
785 s1 = SrcPosAccessor::load(pSrcPos + 4); // y1 z1 x2 y2
786 s2 = SrcPosAccessor::load(pSrcPos + 8); // z2 x3 y3 z3
787
788 // Arrange to 3x4 component-major format for batch calculation
789 __DX_TRANSPOSE4x3_PS(s0, s1, s2);
790
791 // Transform by collapsed matrix
792
793 // Shuffle row 0 of the four collapsed matrices to calculate the X components
794 __DX_TRANSPOSE4x4_PS(m00, m10, m20, m30);
795
796 // Transform X components
797 d0 = __DX_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // X0 X1 X2 X3
798
799 // Shuffle row 1 of the four collapsed matrices to calculate the Y components
800 __DX_TRANSPOSE4x4_PS(m01, m11, m21, m31);
801
802 // Transform Y components
803 d1 = __DX_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // Y0 Y1 Y2 Y3
804
805 // Shuffle row 2 of the four collapsed matrices to calculate the Z components
806 __DX_TRANSPOSE4x4_PS(m02, m12, m22, m32);
807
808 // Transform Z components
809 d2 = __DX_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // Z0 Z1 Z2 Z3
810
811 // Arrange back to 4x3 continuous format to store the results
812 __DX_TRANSPOSE3x4_PS(d0, d1, d2);
813
814 // Store blended positions
815 DestPosAccessor::store(pDestPos + 0, d0);
816 DestPosAccessor::store(pDestPos + 4, d1);
817 DestPosAccessor::store(pDestPos + 8, d2);
818
819 // Advance 4 vertices
820 pSrcPos += 4 * 3;
821 pDestPos += 4 * 3;
822
823 //------------------------------------------------------------------
824 // Transform normals
825 //------------------------------------------------------------------
826
827 // Load source normals
828 s0 = SrcNormAccessor::load(pSrcNorm + 0); // x0 y0 z0 x1
829 s1 = SrcNormAccessor::load(pSrcNorm + 4); // y1 z1 x2 y2
830 s2 = SrcNormAccessor::load(pSrcNorm + 8); // z2 x3 y3 z3
831
832 // Arrange to 3x4 component-major format for batch calculation
833 __DX_TRANSPOSE4x3_PS(s0, s1, s2);
834
835 // Transform by collapsed and shuffled matrices
836 d0 = __DX_DOT3x3_PS(m00, m10, m20, s0, s1, s2); // X0 X1 X2 X3
837 d1 = __DX_DOT3x3_PS(m01, m11, m21, s0, s1, s2); // Y0 Y1 Y2 Y3
838 d2 = __DX_DOT3x3_PS(m02, m12, m22, s0, s1, s2); // Z0 Z1 Z2 Z3
839
840 // Normalise normals
841 XMVECTOR tmp = __DX_DOT3x3_PS(d0, d1, d2, d0, d1, d2);
842 tmp = XMVectorReciprocalSqrtEst(tmp);
843 d0 = XMVectorMultiply(d0, tmp);
844 d1 = XMVectorMultiply(d1, tmp);
845 d2 = XMVectorMultiply(d2, tmp);
846
847 // Arrange back to 4x3 continuous format to store the results
848 __DX_TRANSPOSE3x4_PS(d0, d1, d2);
849
850 // Store blended normals
851 DestNormAccessor::store(pDestNorm + 0, d0);
852 DestNormAccessor::store(pDestNorm + 4, d1);
853 DestNormAccessor::store(pDestNorm + 8, d2);
854
855 // Advance 4 vertices
856 pSrcNorm += 4 * 3;
857 pDestNorm += 4 * 3;
858 }
859 }
860 };
861 static FORCEINLINE void softwareVertexSkinning_DirectXMath_PosNorm_Separated_Packed(
862 const float* pSrcPos, float* pDestPos,
863 const float* pSrcNorm, float* pDestNorm,
864 const float* pBlendWeight, const unsigned char* pBlendIndex,
865 const Matrix4* const* blendMatrices,
866 size_t blendWeightStride, size_t blendIndexStride,
867 size_t numWeightsPerVertex,
868 size_t numIterations)
869 {
870 assert(_isAlignedForDirectXMath(pSrcPos));
871
872 // Instantiate two versions only, since other alignment combinations are not that important.
873 if (_isAlignedForDirectXMath(pSrcNorm) && _isAlignedForDirectXMath(pDestPos) && _isAlignedForDirectXMath(pDestNorm))
874 {
875 SoftwareVertexSkinning_DirectXMath_PosNorm_Separated_Packed<true, true, true, true>::apply(
876 pSrcPos, pDestPos,
877 pSrcNorm, pDestNorm,
878 pBlendWeight, pBlendIndex,
879 blendMatrices,
880 blendWeightStride, blendIndexStride,
881 numWeightsPerVertex,
882 numIterations);
883 }
884 else
885 {
886 SoftwareVertexSkinning_DirectXMath_PosNorm_Separated_Packed<true, false, false, false>::apply(
887 pSrcPos, pDestPos,
888 pSrcNorm, pDestNorm,
889 pBlendWeight, pBlendIndex,
890 blendMatrices,
891 blendWeightStride, blendIndexStride,
892 numWeightsPerVertex,
893 numIterations);
894 }
895 }
896 //---------------------------------------------------------------------
897 // Special DirectXMath version for skinning positions only, where the position
898 // buffer is packed.
899 template <bool srcPosAligned, bool destPosAligned>
900 struct SoftwareVertexSkinning_DirectXMath_PosOnly_Packed
901 {
902 static void apply(
903 const float* pSrcPos, float* pDestPos,
904 const float* pBlendWeight, const unsigned char* pBlendIndex,
905 const Matrix4* const* blendMatrices,
906 size_t blendWeightStride, size_t blendIndexStride,
907 size_t numWeightsPerVertex,
908 size_t numIterations)
909 {
910 typedef DirectXMathMemoryAccessor<srcPosAligned> SrcPosAccessor;
911 typedef DirectXMathMemoryAccessor<destPosAligned> DestPosAccessor;
912
913 // Blending 4 vertices per-iteration
914 for (size_t i = 0; i < numIterations; ++i)
915 {
916 // Collapse matrices
917 XMVECTOR m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
918 _collapseFourMatrices(
919 m00, m01, m02,
920 m10, m11, m12,
921 m20, m21, m22,
922 m30, m31, m32,
923 pBlendWeight, pBlendIndex,
924 blendMatrices,
925 blendWeightStride, blendIndexStride,
926 numWeightsPerVertex);
927
928 // Advance 4 vertices
929 advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
930 advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
931
932 //------------------------------------------------------------------
933 // Transform positions
934 //------------------------------------------------------------------
935
936 XMVECTOR s0, s1, s2, d0, d1, d2;
937
938 // Load source positions
939 s0 = SrcPosAccessor::load(pSrcPos + 0); // x0 y0 z0 x1
940 s1 = SrcPosAccessor::load(pSrcPos + 4); // y1 z1 x2 y2
941 s2 = SrcPosAccessor::load(pSrcPos + 8); // z2 x3 y3 z3
942
943 // Arrange to 3x4 component-major format for batch calculation
944 __DX_TRANSPOSE4x3_PS(s0, s1, s2);
945
946 // Transform by collapsed matrix
947
948 // Shuffle row 0 of the four collapsed matrices to calculate the X components
949 __DX_TRANSPOSE4x4_PS(m00, m10, m20, m30);
950
951 // Transform X components
952 d0 = __DX_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2); // X0 X1 X2 X3
953
954 // Shuffle row 1 of the four collapsed matrices to calculate the Y components
955 __DX_TRANSPOSE4x4_PS(m01, m11, m21, m31);
956
957 // Transform Y components
958 d1 = __DX_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2); // Y0 Y1 Y2 Y3
959
960 // Shuffle row 2 of the four collapsed matrices to calculate the Z components
961 __DX_TRANSPOSE4x4_PS(m02, m12, m22, m32);
962
963 // Transform Z components
964 d2 = __DX_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2); // Z0 Z1 Z2 Z3
965
966 // Arrange back to 4x3 continuous format to store the results
967 __DX_TRANSPOSE3x4_PS(d0, d1, d2);
968
969 // Store blended positions
970 DestPosAccessor::store(pDestPos + 0, d0);
971 DestPosAccessor::store(pDestPos + 4, d1);
972 DestPosAccessor::store(pDestPos + 8, d2);
973
974 // Advance 4 vertices
975 pSrcPos += 4 * 3;
976 pDestPos += 4 * 3;
977 }
978 }
979 };
980 static FORCEINLINE void softwareVertexSkinning_DirectXMath_PosOnly_Packed(
981 const float* pSrcPos, float* pDestPos,
982 const float* pBlendWeight, const unsigned char* pBlendIndex,
983 const Matrix4* const* blendMatrices,
984 size_t blendWeightStride, size_t blendIndexStride,
985 size_t numWeightsPerVertex,
986 size_t numIterations)
987 {
988 assert(_isAlignedForDirectXMath(pSrcPos));
989
990 // Instantiate two versions only, since other alignment combinations are not that important.
991 if (_isAlignedForDirectXMath(pDestPos))
992 {
993 SoftwareVertexSkinning_DirectXMath_PosOnly_Packed<true, true>::apply(
994 pSrcPos, pDestPos,
995 pBlendWeight, pBlendIndex,
996 blendMatrices,
997 blendWeightStride, blendIndexStride,
998 numWeightsPerVertex,
999 numIterations);
1000 }
1001 else
1002 {
1003 SoftwareVertexSkinning_DirectXMath_PosOnly_Packed<true, false>::apply(
1004 pSrcPos, pDestPos,
1005 pBlendWeight, pBlendIndex,
1006 blendMatrices,
1007 blendWeightStride, blendIndexStride,
1008 numWeightsPerVertex,
1009 numIterations);
1010 }
1011 }
1012 //---------------------------------------------------------------------
1013 void OptimisedUtilDirectXMath::softwareVertexSkinning(
1014 const float *pSrcPos, float *pDestPos,
1015 const float *pSrcNorm, float *pDestNorm,
1016 const float *pBlendWeight, const unsigned char* pBlendIndex,
1017 const Matrix4* const* blendMatrices,
1018 size_t srcPosStride, size_t destPosStride,
1019 size_t srcNormStride, size_t destNormStride,
1020 size_t blendWeightStride, size_t blendIndexStride,
1021 size_t numWeightsPerVertex,
1022 size_t numVertices)
1023 {
1024 // All position/normal pointers should be perfectly aligned, but we still check
1025 // here to guard against hardware buffers allocated by a potentially buggy driver
1026 // that doesn't honour alignment properly.
1027 // Because a meta-function technique is used here, the code is easy to maintain
1028 // and still provides all possible alignment combinations.
1029 //
1030
1031 // Use unrolled routines only if there are a lot of vertices
1032 if (numVertices > OGRE_DIRECTXMATH_SKINNING_UNROLL_VERTICES)
1033 {
1034 if (pSrcNorm)
1035 {
1036 // Blend position and normal
1037
1038 if (srcPosStride == sizeof(float) * (3 + 3) && destPosStride == sizeof(float) * (3 + 3) &&
1039 pSrcNorm == pSrcPos + 3 && pDestNorm == pDestPos + 3)
1040 {
1041 // Position and normal share a packed buffer
1042
1043 size_t srcPosAlign = (size_t)pSrcPos & 15;
1044 assert((srcPosAlign & 3) == 0);
1045
1046 // Blend unaligned vertices with general SIMD routine
1047 if (srcPosAlign == 8) // Because of the 8-byte alignment shift per vertex
1048 {
1049 size_t count = srcPosAlign / 8;
1050 numVertices -= count;
1051 softwareVertexSkinning_DirectXMath_General(
1052 pSrcPos, pDestPos,
1053 pSrcNorm, pDestNorm,
1054 pBlendWeight, pBlendIndex,
1055 blendMatrices,
1056 srcPosStride, destPosStride,
1057 srcNormStride, destNormStride,
1058 blendWeightStride, blendIndexStride,
1059 numWeightsPerVertex,
1060 count);
1061
1062 pSrcPos += count * (3 + 3);
1063 pDestPos += count * (3 + 3);
1064 pSrcNorm += count * (3 + 3);
1065 pDestNorm += count * (3 + 3);
1066 advanceRawPointer(pBlendWeight, count * blendWeightStride);
1067 advanceRawPointer(pBlendIndex, count * blendIndexStride);
1068 }
1069
1070 // Blend vertices, four vertices per-iteration
1071 size_t numIterations = numVertices / 4;
1072 softwareVertexSkinning_DirectXMath_PosNorm_Shared_Packed(
1073 pSrcPos, pDestPos,
1074 pBlendWeight, pBlendIndex,
1075 blendMatrices,
1076 blendWeightStride, blendIndexStride,
1077 numWeightsPerVertex,
1078 numIterations);
1079
1080 // Advance pointers for remaining vertices
1081 numVertices &= 3;
1082 if (numVertices)
1083 {
1084 pSrcPos += numIterations * 4 * (3 + 3);
1085 pDestPos += numIterations * 4 * (3 + 3);
1086 pSrcNorm += numIterations * 4 * (3 + 3);
1087 pDestNorm += numIterations * 4 * (3 + 3);
1088 advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1089 advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1090 }
1091 }
1092 else if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3 &&
1093 srcNormStride == sizeof(float) * 3 && destNormStride == sizeof(float) * 3)
1094 {
1095 // Position and normal are separate buffers, and all of them are packed
1096
1097 size_t srcPosAlign = (size_t)pSrcPos & 15;
1098 assert((srcPosAlign & 3) == 0);
1099
1100 // Blend unaligned vertices with general SIMD routine
1101 if (srcPosAlign)
1102 {
1103 size_t count = srcPosAlign / 4;
1104 numVertices -= count;
1105 softwareVertexSkinning_DirectXMath_General(
1106 pSrcPos, pDestPos,
1107 pSrcNorm, pDestNorm,
1108 pBlendWeight, pBlendIndex,
1109 blendMatrices,
1110 srcPosStride, destPosStride,
1111 srcNormStride, destNormStride,
1112 blendWeightStride, blendIndexStride,
1113 numWeightsPerVertex,
1114 count);
1115
1116 pSrcPos += count * 3;
1117 pDestPos += count * 3;
1118 pSrcNorm += count * 3;
1119 pDestNorm += count * 3;
1120 advanceRawPointer(pBlendWeight, count * blendWeightStride);
1121 advanceRawPointer(pBlendIndex, count * blendIndexStride);
1122 }
1123
1124 // Blend vertices, four vertices per-iteration
1125 size_t numIterations = numVertices / 4;
1126 softwareVertexSkinning_DirectXMath_PosNorm_Separated_Packed(
1127 pSrcPos, pDestPos,
1128 pSrcNorm, pDestNorm,
1129 pBlendWeight, pBlendIndex,
1130 blendMatrices,
1131 blendWeightStride, blendIndexStride,
1132 numWeightsPerVertex,
1133 numIterations);
1134
1135 // Advance pointers for remaining vertices
1136 numVertices &= 3;
1137 if (numVertices)
1138 {
1139 pSrcPos += numIterations * 4 * 3;
1140 pDestPos += numIterations * 4 * 3;
1141 pSrcNorm += numIterations * 4 * 3;
1142 pDestNorm += numIterations * 4 * 3;
1143 advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1144 advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1145 }
1146 }
1147 else // Not 'packed' form or wrong order between position and normal
1148 {
1149 // Should never occur, do nothing here just in case
1150 }
1151 }
1152 else // !pSrcNorm
1153 {
1154 // Blend position only
1155
1156 if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3)
1157 {
1158 // All buffers are packed
1159
1160 size_t srcPosAlign = (size_t)pSrcPos & 15;
1161 assert((srcPosAlign & 3) == 0);
1162
1163 // Blend unaligned vertices with general SIMD routine
1164 if (srcPosAlign)
1165 {
1166 size_t count = srcPosAlign / 4;
1167 numVertices -= count;
1168 softwareVertexSkinning_DirectXMath_General(
1169 pSrcPos, pDestPos,
1170 pSrcNorm, pDestNorm,
1171 pBlendWeight, pBlendIndex,
1172 blendMatrices,
1173 srcPosStride, destPosStride,
1174 srcNormStride, destNormStride,
1175 blendWeightStride, blendIndexStride,
1176 numWeightsPerVertex,
1177 count);
1178
1179 pSrcPos += count * 3;
1180 pDestPos += count * 3;
1181 advanceRawPointer(pBlendWeight, count * blendWeightStride);
1182 advanceRawPointer(pBlendIndex, count * blendIndexStride);
1183 }
1184
1185 // Blend vertices, four vertices per-iteration
1186 size_t numIterations = numVertices / 4;
1187 softwareVertexSkinning_DirectXMath_PosOnly_Packed(
1188 pSrcPos, pDestPos,
1189 pBlendWeight, pBlendIndex,
1190 blendMatrices,
1191 blendWeightStride, blendIndexStride,
1192 numWeightsPerVertex,
1193 numIterations);
1194
1195 // Advance pointers for remaining vertices
1196 numVertices &= 3;
1197 if (numVertices)
1198 {
1199 pSrcPos += numIterations * 4 * 3;
1200 pDestPos += numIterations * 4 * 3;
1201 advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1202 advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1203 }
1204 }
1205 else // Not 'packed' form
1206 {
1207 // Might occur only if user forced software blending position only
1208 }
1209 }
1210 }
1211
1212 // Blend remaining vertices. This needs to be done with SIMD for an identical
1213 // result, since mixing the general floating-point and SIMD algorithms would
1214 // cause floating-point discrepancies.
1215 if (numVertices)
1216 {
1217 softwareVertexSkinning_DirectXMath_General(
1218 pSrcPos, pDestPos,
1219 pSrcNorm, pDestNorm,
1220 pBlendWeight, pBlendIndex,
1221 blendMatrices,
1222 srcPosStride, destPosStride,
1223 srcNormStride, destNormStride,
1224 blendWeightStride, blendIndexStride,
1225 numWeightsPerVertex,
1226 numVertices);
1227 }
1228 }
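
// Usage sketch (illustrative only): client code normally reaches this routine
// through the OptimisedUtil singleton rather than instantiating this class
// directly, along the lines of
//
//   OptimisedUtil::getImplementation()->softwareVertexSkinning(
//       srcPos, destPos, srcNorm, destNorm,
//       blendWeights, blendIndices, blendMatrices,
//       srcPosStride, destPosStride, srcNormStride, destNormStride,
//       blendWeightStride, blendIndexStride,
//       numWeightsPerVertex, numVertices);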
1229 //---------------------------------------------------------------------
1230 void OptimisedUtilDirectXMath::softwareVertexMorph(
1231 Real t,
1232 const float *pSrc1, const float *pSrc2,
1233 float *pDst,
1234 size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
1235 size_t numVertices,
1236 bool morphNormals)
1237 {
1238 XMVECTOR src01, src02, src11, src12, src21, src22;
1239 XMVECTOR dst0, dst1, dst2;
1240
1241 XMVECTOR t4 = XMVectorReplicate(t);
1242
1243
1244 // If we're morphing normals, we have twice the number of floats to process
1245 // Positions are interleaved with normals, so we'll have to separately
1246 // normalise just the normals later; we'll just lerp in the first pass
1247 // We can't normalise as we go because normals & positions are only 3 floats
1248 // each so are not aligned for DirectXMath, we'd mix the data up
1249 size_t normalsMultiplier = morphNormals ? 2 : 1;
1250 size_t numIterations = (numVertices*normalsMultiplier) / 4;
1251 size_t numVerticesRemainder = (numVertices*normalsMultiplier) & 3;
1252
1253 // Save for later
1254 float *pStartDst = pDst;
1255
1256 // Never use the meta-function technique for accessing memory here, because
1257 // VC7.1 appears to generate somewhat inefficient binary code when the following
1258 // code is put into an inline function.
1259
1260 if (_isAlignedForDirectXMath(pSrc1) && _isAlignedForDirectXMath(pSrc2) && _isAlignedForDirectXMath(pDst))
1261 {
1262 // All data aligned
1263
1264 // Morph 4 vertices per iteration. Specially designed to use as many
1265 // available CPU registers as possible (7 registers used here),
1266 // and to avoid temporary values allocated on the stack, suppressing
1267 // extra memory accesses.
1268 for (size_t i = 0; i < numIterations; ++i)
1269 {
1270 // 12 floating-point values
1271 src01 = __DX_LOAD_PS(pSrc1 + 0);
1272 src02 = __DX_LOAD_PS(pSrc2 + 0);
1273 src11 = __DX_LOAD_PS(pSrc1 + 4);
1274 src12 = __DX_LOAD_PS(pSrc2 + 4);
1275 src21 = __DX_LOAD_PS(pSrc1 + 8);
1276 src22 = __DX_LOAD_PS(pSrc2 + 8);
1277 pSrc1 += 12; pSrc2 += 12;
1278
1279 dst0 = __DX_LERP_PS(t4, src01, src02);
1280 dst1 = __DX_LERP_PS(t4, src11, src12);
1281 dst2 = __DX_LERP_PS(t4, src21, src22);
1282
1283 __DX_STORE_PS(pDst + 0, dst0);
1284 __DX_STORE_PS(pDst + 4, dst1);
1285 __DX_STORE_PS(pDst + 8, dst2);
1286 pDst += 12;
1287 }
1288
1289 // Morph remaining vertices
1290 switch (numVerticesRemainder)
1291 {
1292 case 3:
1293 // 9 floating-point values
1294 src01 = __DX_LOAD_PS(pSrc1 + 0);
1295 src02 = __DX_LOAD_PS(pSrc2 + 0);
1296 src11 = __DX_LOAD_PS(pSrc1 + 4);
1297 src12 = __DX_LOAD_PS(pSrc2 + 4);
1298 src21 = XMLoadFloat(pSrc1 + 8);
1299 src22 = XMLoadFloat(pSrc2 + 8);
1300
1301 dst0 = __DX_LERP_PS(t4, src01, src02);
1302 dst1 = __DX_LERP_PS(t4, src11, src12);
1303 dst2 = __DX_LERP_SS(t4, src21, src22);
1304
1305 __DX_STORE_PS(pDst + 0, dst0);
1306 __DX_STORE_PS(pDst + 4, dst1);
1307 XMStoreFloat(pDst + 8, dst2);
1308 break;
1309
1310 case 2:
1311 // 6 floating-point values
1312 src01 = __DX_LOAD_PS(pSrc1 + 0);
1313 src02 = __DX_LOAD_PS(pSrc2 + 0);
1314 src11 = XMLoadFloat2((XMFLOAT2*)(pSrc1 + 4));
1315 src12 = XMLoadFloat2((XMFLOAT2*)(pSrc2 + 4));
1316
1317 dst0 = __DX_LERP_PS(t4, src01, src02);
1318 dst1 = __DX_LERP_PS(t4, src11, src12);
1319
1320 __DX_STORE_PS(pDst + 0, dst0);
1321 XMStoreFloat2((XMFLOAT2*)(pDst + 4), dst1);
1322 break;
1323
1324 case 1:
1325 // 3 floating-point values
1326 src01 = XMLoadFloat3((XMFLOAT3*)(pSrc1 + 0));
1327 src02 = XMLoadFloat3((XMFLOAT3*)(pSrc2 + 0));
1328
1329 dst0 = __DX_LERP_PS(t4, src01, src02);
1330
1331 XMStoreFloat3((XMFLOAT3*)(pDst + 0), dst0);
1332 break;
1333 }
1334 }
1335 else // Should never occur, just in case of buggy drivers
1336 {
1337 // Assume all data unaligned
1338
1339 // Morph 4 vertices per iteration. Specially designed to use as many
1340 // available CPU registers as possible (7 registers used here),
1341 // and to avoid temporary values allocated on the stack, suppressing
1342 // extra memory accesses.
1343 for (size_t i = 0; i < numIterations; ++i)
1344 {
1345 // 12 floating-point values
1346 src01 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 0));
1347 src02 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 0));
1348 src11 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 4));
1349 src12 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 4));
1350 src21 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 8));
1351 src22 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 8));
1352 pSrc1 += 12; pSrc2 += 12;
1353
1354 dst0 = __DX_LERP_PS(t4, src01, src02);
1355 dst1 = __DX_LERP_PS(t4, src11, src12);
1356 dst2 = __DX_LERP_PS(t4, src21, src22);
1357
1358 XMStoreFloat4((XMFLOAT4*)(pDst + 0), dst0);
1359 XMStoreFloat4((XMFLOAT4*)(pDst + 4), dst1);
1360 XMStoreFloat4((XMFLOAT4*)(pDst + 8), dst2);
1361 pDst += 12;
1362 }
1363
1364 // Morph remaining vertices
1365 switch (numVerticesRemainder)
1366 {
1367 case 3:
1368 // 9 floating-point values
1369 src01 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 0));
1370 src02 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 0));
1371 src11 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 4));
1372 src12 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 4));
1373 src21 = XMLoadFloat(pSrc1 + 8);
1374 src22 = XMLoadFloat(pSrc2 + 8);
1375
1376 dst0 = __DX_LERP_PS(t4, src01, src02);
1377 dst1 = __DX_LERP_PS(t4, src11, src12);
1378 dst2 = __DX_LERP_SS(t4, src21, src22);
1379
1380 XMStoreFloat4((XMFLOAT4*)(pDst + 0), dst0);
1381 XMStoreFloat4((XMFLOAT4*)(pDst + 4), dst1);
1382 XMStoreFloat(pDst + 8, dst2);
1383 break;
1384
1385 case 2:
1386 // 6 floating-point values
1387 src01 = XMLoadFloat4((XMFLOAT4*)(pSrc1 + 0));
1388 src02 = XMLoadFloat4((XMFLOAT4*)(pSrc2 + 0));
1389 src11 = XMLoadFloat2((XMFLOAT2*)(pSrc1 + 4));
1390 src12 = XMLoadFloat2((XMFLOAT2*)(pSrc2 + 4));
1391
1392 dst0 = __DX_LERP_PS(t4, src01, src02);
1393 dst1 = __DX_LERP_PS(t4, src11, src12);
1394
1395 XMStoreFloat4((XMFLOAT4*)(pDst + 0), dst0);
1396 XMStoreFloat2((XMFLOAT2*)(pDst + 4), dst1);
1397 break;
1398
1399 case 1:
1400 // 3 floating-point values
1401 src01 = XMLoadFloat3((XMFLOAT3*)(pSrc1 + 0));
1402 src02 = XMLoadFloat3((XMFLOAT3*)(pSrc2 + 0));
1403
1404 dst0 = __DX_LERP_PS(t4, src01, src02);
1405
1406 XMStoreFloat3((XMFLOAT3*)(pDst + 0), dst0);
1407 break;
1408 }
1409
1410 }
1411
1412 if (morphNormals)
1413 {
1414
1415 // Now we need to do an unaligned normalise on the normals data we just
1416 // lerped; because normals are 3 elements each, they're always unaligned
1417 float *pNorm = pStartDst;
1418
1419 // Offset past first position
1420 pNorm += 3;
1421
1422 // We'll do one normal each iteration, but still use DirectXMath
1423 for (size_t n = 0; n < numVertices; ++n)
1424 {
1425 // normalise function
1426 XMVECTOR norm;
1427
1428 // load 3 floating-point normal values
1429 norm = XMLoadFloat3((XMFLOAT3*)pNorm);
1430 norm = XMVector3Normalize(norm);
1431
1432 // Store back in the same place
1433 XMStoreFloat3((XMFLOAT3*)pNorm, norm);
1434
1435 // Skip to next vertex (3x normal components, 3x position components)
1436 pNorm += 6;
1437 }
1438 }
1439 }
1440 //---------------------------------------------------------------------
1441 void OptimisedUtilDirectXMath::concatenateAffineMatrices(
1442 const Matrix4& baseMatrix,
1443 const Matrix4* pSrcMat,
1444 Matrix4* pDstMat,
1445 size_t numMatrices)
1446 {
1447 assert(_isAlignedForDirectXMath(pSrcMat));
1448 assert(_isAlignedForDirectXMath(pDstMat));
1449
1450 // Load base matrix, unaligned
1451 XMVECTOR m0 = XMLoadFloat4((XMFLOAT4*)baseMatrix[0]);
1452 XMVECTOR m1 = XMLoadFloat4((XMFLOAT4*)baseMatrix[1]);
1453 XMVECTOR m2 = XMLoadFloat4((XMFLOAT4*)baseMatrix[2]);
1454 XMVECTOR m3 = XMLoadFloat4((XMFLOAT4*)baseMatrix[3]); // m3 should be equal to (0, 0, 0, 1)
1455
1456 for (size_t i = 0; i < numMatrices; ++i)
1457 {
1458 // Load source matrix, aligned
1459 XMVECTOR s0 = __DX_LOAD_PS((*pSrcMat)[0]);
1460 XMVECTOR s1 = __DX_LOAD_PS((*pSrcMat)[1]);
1461 XMVECTOR s2 = __DX_LOAD_PS((*pSrcMat)[2]);
1462
1463 ++pSrcMat;
1464
1465 XMVECTOR t0, t1, t2, t3;
1466
1467 // Concatenate matrix, and store results
1468
1469 // Row 0
1470 t0 = XMVectorMultiply(XMVectorSplatX(m0), s0);
1471 t1 = XMVectorMultiply(XMVectorSplatY(m0), s1);
1472 t2 = XMVectorMultiply(XMVectorSplatZ(m0), s2);
1473 t3 = XMVectorMultiply(m0, m3); // Compiler should optimise this out of the loop
1474 __DX_STORE_PS((*pDstMat)[0], __DX_ACCUM4_PS(t0,t1,t2,t3));
1475
1476 // Row 1
1477 t0 = XMVectorMultiply(XMVectorSplatX(m1), s0);
1478 t1 = XMVectorMultiply(XMVectorSplatY(m1), s1);
1479 t2 = XMVectorMultiply(XMVectorSplatZ(m1), s2);
1480 t3 = XMVectorMultiply(m1, m3); // Compiler should optimise this out of the loop
1481 __DX_STORE_PS((*pDstMat)[1], __DX_ACCUM4_PS(t0,t1,t2,t3));
1482
1483 // Row 2
1484 t0 = XMVectorMultiply(XMVectorSplatX(m2), s0);
1485 t1 = XMVectorMultiply(XMVectorSplatY(m2), s1);
1486 t2 = XMVectorMultiply(XMVectorSplatZ(m2), s2);
1487 t3 = XMVectorMultiply(m2, m3); // Compiler should optimise this out of the loop
1488 __DX_STORE_PS((*pDstMat)[2], __DX_ACCUM4_PS(t0,t1,t2,t3));
1489
1490 // Row 3
1491 __DX_STORE_PS((*pDstMat)[3], m3);
1492
1493 ++pDstMat;
1494 }
1495 }
1496 //---------------------------------------------------------------------
1497     void OptimisedUtilDirectXMath::calculateFaceNormals(
1498 const float *positions,
1499 const EdgeData::Triangle *triangles,
1500 Vector4 *faceNormals,
1501 size_t numTriangles)
1502 {
1503 assert(_isAlignedForDirectXMath(faceNormals));
1504
1505 size_t numIterations = numTriangles / 4;
1506 numTriangles &= 3;
1507
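        // Each stored face "normal" is really the triangle's plane:
        //   n = (v1 - v0) x (v2 - v0),   w = -(n . v0)
        // so that n . p + w == 0 for any point p on the triangle; only the
        // sign of that expression matters to calculateLightFacing() below.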
1508 // Four triangles per-iteration
1509 for (size_t i = 0; i < numIterations; ++i)
1510 {
1511
1512 // Load four Vector3 as: (x0, x1, x2, x3), (y0, y1, y2, y3), (z0, z1, z2, z3)
1513 #define __LOAD_FOUR_VECTOR3(x, y, z, p0, p1, p2, p3) \
1514 { \
1515 XMVECTOR v0 = XMLoadFloat3((XMFLOAT3*)(p0)); /* x0 y0 z0 -- */ \
1516 XMVECTOR v1 = XMLoadFloat3((XMFLOAT3*)(p1)); /* x1 y1 z1 -- */ \
1517 XMVECTOR v2 = XMLoadFloat3((XMFLOAT3*)(p2)); /* x2 y2 z2 -- */ \
1518 XMVECTOR v3 = XMLoadFloat3((XMFLOAT3*)(p3)); /* x3 y3 z3 -- */ \
1519 XMVECTOR t0, t1; \
1520 \
1521 t0 = XMVectorMergeXY(v0, v2); /* x0 x2 y0 y2 */ \
1522 t1 = XMVectorMergeXY(v1, v3); /* x1 x3 y1 y3 */ \
1523 x = XMVectorMergeXY(t0, t1); /* x0 x1 x2 x3 */ \
1524 y = XMVectorMergeZW(t0, t1); /* y0 y1 y2 y3 */ \
1525 \
1526 t0 = XMVectorMergeZW(v0, v2); /* z0 z2 -- -- */ \
1527             t1 = XMVectorMergeZW(v1, v3); /* z1 z3 -- -- */ \
1528 z = XMVectorMergeXY(t0, t1); /* z0 z1 z2 z3 */ \
1529 }
1530
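            // The merge (unpack) sequence in the macro above is a 4x3
            // transpose: four (x, y, z) points loaded as AoS come out as the
            // three component-major (SoA) vectors x, y and z.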
1531 XMVECTOR x0, x1, x2, y0, y1, y2, z0, z1, z2;
1532
1533 // Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz
1534 __LOAD_FOUR_VECTOR3(x0, y0, z0,
1535 positions + triangles[0].vertIndex[0] * 3,
1536 positions + triangles[1].vertIndex[0] * 3,
1537 positions + triangles[2].vertIndex[0] * 3,
1538 positions + triangles[3].vertIndex[0] * 3);
1539
1540 // Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz
1541 __LOAD_FOUR_VECTOR3(x1, y1, z1,
1542 positions + triangles[0].vertIndex[1] * 3,
1543 positions + triangles[1].vertIndex[1] * 3,
1544 positions + triangles[2].vertIndex[1] * 3,
1545 positions + triangles[3].vertIndex[1] * 3);
1546
1547 // Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz
1548 __LOAD_FOUR_VECTOR3(x2, y2, z2,
1549 positions + triangles[0].vertIndex[2] * 3,
1550 positions + triangles[1].vertIndex[2] * 3,
1551 positions + triangles[2].vertIndex[2] * 3,
1552 positions + triangles[3].vertIndex[2] * 3);
1553
1554 triangles += 4;
1555
1556 // Calculate triangle face normals
1557
1558 // a = v1 - v0
1559 XMVECTOR ax = XMVectorSubtract(x1, x0);
1560 XMVECTOR ay = XMVectorSubtract(y1, y0);
1561 XMVECTOR az = XMVectorSubtract(z1, z0);
1562
1563 // b = v2 - v0
1564 XMVECTOR bx = XMVectorSubtract(x2, x0);
1565 XMVECTOR by = XMVectorSubtract(y2, y0);
1566 XMVECTOR bz = XMVectorSubtract(z2, z0);
1567
1568 // n = a cross b
1569 XMVECTOR nx = XMVectorSubtract(XMVectorMultiply(ay, bz), XMVectorMultiply(az, by));
1570 XMVECTOR ny = XMVectorSubtract(XMVectorMultiply(az, bx), XMVectorMultiply(ax, bz));
1571 XMVECTOR nz = XMVectorSubtract(XMVectorMultiply(ax, by), XMVectorMultiply(ay, bx));
1572
1573 // w = - (n dot v0)
1574 XMVECTOR nw = XMVectorNegate(__DX_DOT3x3_PS(nx, ny, nz, x0, y0, z0));
1575
1576             // Rearrange into per-triangle face-normal-major format
1577 __DX_TRANSPOSE4x4_PS(nx, ny, nz, nw);
1578
1579 // Store results
1580 __DX_STORE_PS(&faceNormals[0].x, nx);
1581 __DX_STORE_PS(&faceNormals[1].x, ny);
1582 __DX_STORE_PS(&faceNormals[2].x, nz);
1583 __DX_STORE_PS(&faceNormals[3].x, nw);
1584 faceNormals += 4;
1585 }
1586
1587 // Dealing with remaining triangles
1588 for (size_t j = 0; j < numTriangles; ++j)
1589 {
1590 // Load vertices of the triangle
1591 XMVECTOR v0 = XMLoadFloat3((XMFLOAT3*)(positions + triangles->vertIndex[0] * 3));
1592 XMVECTOR v1 = XMLoadFloat3((XMFLOAT3*)(positions + triangles->vertIndex[1] * 3));
1593 XMVECTOR v2 = XMLoadFloat3((XMFLOAT3*)(positions + triangles->vertIndex[2] * 3));
1594 ++triangles;
1595
1596 // Calculate face normal
1597 XMVECTOR plane = XMPlaneFromPoints(v0, v1, v2);
1598
1599 // Store result
1600 __DX_STORE_PS(&faceNormals->x, plane);
1601 ++faceNormals;
1602 }
1603 }
1604 //---------------------------------------------------------------------
1605     void OptimisedUtilDirectXMath::calculateLightFacing(
1606 const Vector4& lightPos,
1607 const Vector4* faceNormals,
1608 char* lightFacings,
1609 size_t numFaces)
1610 {
1611 assert(_isAlignedForDirectXMath(faceNormals));
1612
1613         // Map to convert a 4-bit mask to 4 byte values
1614 static const char msMaskMapping[16][4] =
1615 {
1616 {0, 0, 0, 0}, {1, 0, 0, 0}, {0, 1, 0, 0}, {1, 1, 0, 0},
1617 {0, 0, 1, 0}, {1, 0, 1, 0}, {0, 1, 1, 0}, {1, 1, 1, 0},
1618 {0, 0, 0, 1}, {1, 0, 0, 1}, {0, 1, 0, 1}, {1, 1, 0, 1},
1619 {0, 0, 1, 1}, {1, 0, 1, 1}, {0, 1, 1, 1}, {1, 1, 1, 1},
1620 };
1621
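        // A face counts as light-facing when the 4-component dot product of
        // its plane (nx, ny, nz, w) with the homogeneous light position is
        // positive, i.e. lightFacings[i] = (faceNormals[i] . lightPos > 0);
        // the w term lets the same test serve directional (w == 0) and
        // point (w == 1) light positions.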
1622 XMVECTOR n0, n1, n2, n3;
1623 XMVECTOR t0, t1;
1624 XMVECTOR dp;
1625 int bitmask;
1626
1627 // Load light vector, unaligned
1628 XMVECTOR lp = XMLoadFloat4((XMFLOAT4*)(&lightPos.x));
1629
1630 size_t numIterations = numFaces / 4;
1631 numFaces &= 3;
1632
1633 // Four faces per-iteration
1634 for (size_t i = 0; i < numIterations; ++i)
1635 {
1636 // Load face normals, aligned
1637 n0 = __DX_LOAD_PS(&faceNormals[0].x);
1638 n1 = __DX_LOAD_PS(&faceNormals[1].x);
1639 n2 = __DX_LOAD_PS(&faceNormals[2].x);
1640 n3 = __DX_LOAD_PS(&faceNormals[3].x);
1641 faceNormals += 4;
1642
1643 // Multiply by light vector
1644 n0 = XMVectorMultiply(n0, lp); // x0 y0 z0 w0
1645 n1 = XMVectorMultiply(n1, lp); // x1 y1 z1 w1
1646 n2 = XMVectorMultiply(n2, lp); // x2 y2 z2 w2
1647 n3 = XMVectorMultiply(n3, lp); // x3 y3 z3 w3
1648
1649             // Horizontally add the four vector values.
1650 t0 = XMVectorAdd( // x0+z0 x1+z1 y0+w0 y1+w1
1651 XMVectorMergeXY(n0, n1), // x0 x1 y0 y1
1652 XMVectorMergeZW(n0, n1)); // z0 z1 w0 w1
1653 t1 = XMVectorAdd( // x2+z2 x3+z3 y2+w2 y3+w3
1654 XMVectorMergeXY(n2, n3), // x2 x3 y2 y3
1655 XMVectorMergeZW(n2, n3)); // z2 z3 w2 w3
1656 dp = XMVectorAdd( // dp0 dp1 dp2 dp3
1657 XMVectorPermute<0, 1, 4, 5>(t0, t1), // x0+z0 x1+z1 x2+z2 x3+z3
1658 XMVectorPermute<6, 7, 2, 3>(t1, t0)); // y0+w0 y1+w1 y2+w2 y3+w3
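            // XMVectorPermute indices 0-3 pick lanes from the first argument
            // and 4-7 from the second, so the two permutes above gather the
            // x+z and y+w partial sums into matching lanes for the final add.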
1659
1660 bitmask = XMVector4GreaterR(dp, g_XMZero);
1661
1662             // Convert the 4-bit mask to 4 bytes, and store the results.
1663 *reinterpret_cast<uint32*>(lightFacings) =
1664 *reinterpret_cast<const uint32*>(msMaskMapping[bitmask]);
1665 lightFacings += 4;
1666 }
1667
1668 // Dealing with remaining faces
1669 switch (numFaces)
1670 {
1671 case 3:
1672 n0 = __DX_LOAD_PS(&faceNormals[0].x);
1673 n1 = __DX_LOAD_PS(&faceNormals[1].x);
1674 n2 = __DX_LOAD_PS(&faceNormals[2].x);
1675
1676 n0 = XMVectorMultiply(n0, lp); // x0 y0 z0 w0
1677 n1 = XMVectorMultiply(n1, lp); // x1 y1 z1 w1
1678 n2 = XMVectorMultiply(n2, lp); // x2 y2 z2 w2
1679
1680 t0 = XMVectorAdd( // x0+z0 x1+z1 y0+w0 y1+w1
1681 XMVectorMergeXY(n0, n1), // x0 x1 y0 y1
1682 XMVectorMergeZW(n0, n1)); // z0 z1 w0 w1
1683 t1 = XMVectorAdd( // x2+z2 x2+z2 y2+w2 y2+w2
1684 XMVectorMergeXY(n2, n2), // x2 x2 y2 y2
1685 XMVectorMergeZW(n2, n2)); // z2 z2 w2 w2
1686 dp = XMVectorAdd( // dp0 dp1 dp2 dp2
1687 XMVectorPermute<0, 1, 4, 5>(t0, t1), // x0+z0 x1+z1 x2+z2 x2+z2
1688 XMVectorPermute<6, 7, 2, 3>(t1, t0)); // y0+w0 y1+w1 y2+w2 y2+w2
1689
1690 bitmask = XMVector4GreaterR(dp, g_XMZero);
1691
1692 lightFacings[0] = msMaskMapping[bitmask][0];
1693 lightFacings[1] = msMaskMapping[bitmask][1];
1694 lightFacings[2] = msMaskMapping[bitmask][2];
1695 break;
1696
1697 case 2:
1698 n0 = __DX_LOAD_PS(&faceNormals[0].x);
1699 n1 = __DX_LOAD_PS(&faceNormals[1].x);
1700
1701 n0 = XMVectorMultiply(n0, lp); // x0 y0 z0 w0
1702 n1 = XMVectorMultiply(n1, lp); // x1 y1 z1 w1
1703
1704 t0 = XMVectorAdd( // x0+z0 x1+z1 y0+w0 y1+w1
1705 XMVectorMergeXY(n0, n1), // x0 x1 y0 y1
1706 XMVectorMergeZW(n0, n1)); // z0 z1 w0 w1
1707 dp = XMVectorAdd( // dp0 dp1 dp0 dp1
1708 XMVectorSwizzle<0, 1, 0, 1>(t0), // x0+z0 x1+z1 x0+z0 x1+z1
1709 XMVectorSwizzle<2, 3, 2, 3>(t0)); // y0+w0 y1+w1 y0+w0 y1+w1
1710
1711 bitmask = XMVector4GreaterR(dp, g_XMZero);
1712
1713 lightFacings[0] = msMaskMapping[bitmask][0];
1714 lightFacings[1] = msMaskMapping[bitmask][1];
1715 break;
1716
1717 case 1:
1718 n0 = __DX_LOAD_PS(&faceNormals[0].x);
1719
1720 n0 = XMVectorMultiply(n0, lp); // x0 y0 z0 w0
1721
1722 t0 = XMVectorAdd( // x0+z0 x0+z0 y0+w0 y0+w0
1723 XMVectorMergeXY(n0, n0), // x0 x0 y0 y0
1724 XMVectorMergeZW(n0, n0)); // z0 z0 w0 w0
1725 dp = XMVectorAdd( // dp0 dp0 dp0 dp0
1726 XMVectorSplatX(t0), // x0+z0 x0+z0 x0+z0 x0+z0
1727 XMVectorSplatZ(t0)); // y0+w0 y0+w0 y0+w0 y0+w0
1728
1729 bitmask = XMVector4GreaterR(dp, g_XMZero);
1730
1731 lightFacings[0] = msMaskMapping[bitmask][0];
1732 break;
1733 }
1734 }
1735 //---------------------------------------------------------------------
1736 // Template to extrude vertices for directional light.
1737 template <bool srcAligned, bool destAligned>
1738 struct ExtrudeVertices_DirectXMath_DirectionalLight
1739 {
1740         static void apply(
1741 const Vector4& lightPos,
1742 Real extrudeDist,
1743 const float* pSrcPos,
1744 float* pDestPos,
1745 size_t numVertices)
1746 {
1747 typedef DirectXMathMemoryAccessor<srcAligned> SrcAccessor;
1748 typedef DirectXMathMemoryAccessor<destAligned> DestAccessor;
1749
1750 // Directional light, extrusion is along light direction
1751
1752 // Load light vector, unaligned
1753 XMVECTOR lp = XMLoadFloat4((XMFLOAT4*)(&lightPos.x));
1754
1755             // Calculate the extrusion direction. Note that we use the inverted direction
1756             // here to eliminate an extra negate instruction; we'll compensate for that
1757             // by using a subtract instruction instead later.
1758 XMVECTOR dir = XMVectorMultiply( // X Y Z -
1759 XMVector3NormalizeEst(lp),
1760 XMVectorReplicate(extrudeDist));
1761
1762             // Prepare the extrusion direction for extruding 4 vertices in parallel
1763 XMVECTOR dir0 = XMVectorSwizzle<0, 1, 2, 0>(dir); // X Y Z X
1764 XMVECTOR dir1 = XMVectorSwizzle<1, 2, 0, 1>(dir); // Y Z X Y
1765 XMVECTOR dir2 = XMVectorSwizzle<2, 0, 1, 2>(dir); // Z X Y Z
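            // Four packed (x, y, z) positions occupy exactly three XMVECTORs
            // (12 floats), so the rotated copies dir0/dir1/dir2 line the
            // direction components up with the lanes of each loaded register:
            //   s0 = x0 y0 z0 x1  <->  dir0 = X Y Z X
            //   s1 = y1 z1 x2 y2  <->  dir1 = Y Z X Y
            //   s2 = z2 x3 y3 z3  <->  dir2 = Z X Y Z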
1766
1767 XMVECTOR s0, s1, s2;
1768 XMVECTOR d0, d1, d2;
1769
1770 size_t numIterations = numVertices / 4;
1771 numVertices &= 3;
1772
1773 // Extruding 4 vertices per-iteration
1774 for (size_t i = 0; i < numIterations; ++i)
1775 {
1776 s0 = SrcAccessor::load(pSrcPos + 0);
1777 s1 = SrcAccessor::load(pSrcPos + 4);
1778 s2 = SrcAccessor::load(pSrcPos + 8);
1779 pSrcPos += 12;
1780
1781 // The extrusion direction is inverted, use subtract instruction here
1782 d0 = XMVectorSubtract(s0, dir0); // X0 Y0 Z0 X1
1783 d1 = XMVectorSubtract(s1, dir1); // Y1 Z1 X2 Y2
1784 d2 = XMVectorSubtract(s2, dir2); // Z2 X3 Y3 Z3
1785
1786 DestAccessor::store(pDestPos + 0, d0);
1787 DestAccessor::store(pDestPos + 4, d1);
1788 DestAccessor::store(pDestPos + 8, d2);
1789 pDestPos += 12;
1790 }
1791
1792 // Dealing with remaining vertices
1793 switch (numVertices)
1794 {
1795 case 3:
1796 // 9 floating-point values
1797 s0 = SrcAccessor::load(pSrcPos + 0);
1798 s1 = SrcAccessor::load(pSrcPos + 4);
1799 s2 = XMLoadFloat(pSrcPos + 8);
1800
1801 // The extrusion direction is inverted, use subtract instruction here
1802 d0 = XMVectorSubtract(s0, dir0); // X0 Y0 Z0 X1
1803 d1 = XMVectorSubtract(s1, dir1); // Y1 Z1 X2 Y2
1804 d2 = XMVectorSubtract(s2, dir2); // Z2 -- -- --
1805
1806 DestAccessor::store(pDestPos + 0, d0);
1807 DestAccessor::store(pDestPos + 4, d1);
1808 XMStoreFloat(pDestPos + 8, d2);
1809 break;
1810
1811 case 2:
1812 // 6 floating-point values
1813 s0 = SrcAccessor::load(pSrcPos + 0);
1814 s1 = XMLoadFloat2((XMFLOAT2*)(pSrcPos + 4));
1815
1816 // The extrusion direction is inverted, use subtract instruction here
1817 d0 = XMVectorSubtract(s0, dir0); // X0 Y0 Z0 X1
1818 d1 = XMVectorSubtract(s1, dir1); // Y1 Z1 -- --
1819
1820 DestAccessor::store(pDestPos + 0, d0);
1821 XMStoreFloat2((XMFLOAT2*)(pDestPos + 4), d1);
1822 break;
1823
1824 case 1:
1825 // 3 floating-point values
1826 s0 = XMLoadFloat3((XMFLOAT3*)(pSrcPos + 0));
1827
1828 // The extrusion direction is inverted, use subtract instruction here
1829 d0 = XMVectorSubtract(s0, dir0); // X0 Y0 Z0 --
1830
1831 XMStoreFloat3((XMFLOAT3*)(pDestPos + 0), d0);
1832 break;
1833 }
1834 }
1835 };
1836 //---------------------------------------------------------------------
1837 // Template to extrude vertices for point light.
1838 template <bool srcAligned, bool destAligned>
1839 struct ExtrudeVertices_DirectXMath_PointLight
1840 {
1841         static void apply(
1842 const Vector4& lightPos,
1843 Real extrudeDist,
1844 const float* pSrcPos,
1845 float* pDestPos,
1846 size_t numVertices)
1847 {
1848 typedef DirectXMathMemoryAccessor<srcAligned> SrcAccessor;
1849 typedef DirectXMathMemoryAccessor<destAligned> DestAccessor;
1850
1851             // Point light: the extrusion direction is calculated for every vertex
1852
1853 // Load light vector, unaligned
1854 XMVECTOR lp = XMLoadFloat4((XMFLOAT4*)(&lightPos.x));
1855
1856 // Load extrude distance
1857 XMVECTOR extrudeDist4 = XMVectorReplicate(extrudeDist);
1858
1859 size_t numIterations = numVertices / 4;
1860 numVertices &= 3;
1861
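            // Per SIMD lane (i.e. per source vertex v) the loop below computes
            //   d  = v - lightPos.xyz
            //   v' = v + extrudeDist * d / |d|
            // where 1/|d| comes from the estimated reciprocal square root of
            // (d . d), as noted in extrudeVertices().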
1862 // Extruding 4 vertices per-iteration
1863 for (size_t i = 0; i < numIterations; ++i)
1864 {
1865 // Load source positions
1866 XMVECTOR s0 = SrcAccessor::load(pSrcPos + 0); // x0 y0 z0 x1
1867 XMVECTOR s1 = SrcAccessor::load(pSrcPos + 4); // y1 z1 x2 y2
1868 XMVECTOR s2 = SrcAccessor::load(pSrcPos + 8); // z2 x3 y3 z3
1869 pSrcPos += 12;
1870
1871                 // Arrange into 3x4 component-major form for batched calculation
1872 __DX_TRANSPOSE4x3_PS(s0, s1, s2);
1873
1874 // Calculate unnormalised extrusion direction
1875 XMVECTOR dx = XMVectorSubtract(s0, XMVectorSplatX(lp)); // X0 X1 X2 X3
1876 XMVECTOR dy = XMVectorSubtract(s1, XMVectorSplatY(lp)); // Y0 Y1 Y2 Y3
1877 XMVECTOR dz = XMVectorSubtract(s2, XMVectorSplatZ(lp)); // Z0 Z1 Z2 Z3
1878
1879 // Normalise extrusion direction and multiply by extrude distance
1880 XMVECTOR tmp = __DX_DOT3x3_PS(dx, dy, dz, dx, dy, dz);
1881 tmp = XMVectorMultiply(XMVectorReciprocalSqrtEst(tmp), extrudeDist4);
1882 dx = XMVectorMultiply(dx, tmp);
1883 dy = XMVectorMultiply(dy, tmp);
1884 dz = XMVectorMultiply(dz, tmp);
1885
1886 // Calculate extruded positions
1887 XMVECTOR d0 = XMVectorAdd(dx, s0);
1888 XMVECTOR d1 = XMVectorAdd(dy, s1);
1889 XMVECTOR d2 = XMVectorAdd(dz, s2);
1890
1891                 // Arrange back into 4x3 contiguous format for storing the results
1892 __DX_TRANSPOSE3x4_PS(d0, d1, d2);
1893
1894 // Store extruded positions
1895 DestAccessor::store(pDestPos + 0, d0);
1896 DestAccessor::store(pDestPos + 4, d1);
1897 DestAccessor::store(pDestPos + 8, d2);
1898 pDestPos += 12;
1899 }
1900
1901 // Dealing with remaining vertices
1902 for (size_t j = 0; j < numVertices; ++j)
1903 {
1904 // Load source position
1905 XMVECTOR src = XMLoadFloat3((XMFLOAT3*)(pSrcPos + 0)); // x y z 0
1906 pSrcPos += 3;
1907
1908 // Calculate unnormalised extrusion direction
1909 XMVECTOR dir = XMVectorSubtract(src, lp); // X Y Z 0
1910
1911 // Normalise extrusion direction and multiply by extrude distance
1912 dir = XMVectorMultiply(
1913 XMVector3NormalizeEst(dir),
1914 extrudeDist4);
1915
1916 // Calculate extruded position
1917 XMVECTOR dst = XMVectorAdd(dir, src);
1918
1919 // Store extruded position
1920 XMStoreFloat3((XMFLOAT3*)(pDestPos + 0), dst);
1921 pDestPos += 3;
1922 }
1923 }
1924 };
1925 //---------------------------------------------------------------------
1926     void OptimisedUtilDirectXMath::extrudeVertices(
1927 const Vector4& lightPos,
1928 Real extrudeDist,
1929 const float* pSrcPos,
1930 float* pDestPos,
1931 size_t numVertices)
1932 {
1933         // Note: Since pDestPos follows the tail of pSrcPos, we can't assume
1934         // it's properly aligned for SIMD, so we must check for that here.
1935         //
1936         // TODO: Add an extra vertex to the vertex buffer to make sure pDestPos
1937         // gets the same alignment as pSrcPos.
1938 //
1939
1940         // We use the DirectXMath estimated reciprocal square root directly while
1941         // calculating the extrusion direction; the precision loss is not that important here.
1942 //
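        // The alignment checks are resolved once here so that each templated
        // loop above is instantiated with compile-time constant accessors
        // instead of branching on alignment per vertex.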
1943 if (lightPos.w == 0.0f)
1944 {
1945 if (_isAlignedForDirectXMath(pSrcPos))
1946 {
1947 if (_isAlignedForDirectXMath(pDestPos))
1948 ExtrudeVertices_DirectXMath_DirectionalLight<true, true>::apply(
1949 lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
1950 else
1951 ExtrudeVertices_DirectXMath_DirectionalLight<true, false>::apply(
1952 lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
1953 }
1954 else
1955 {
1956 if (_isAlignedForDirectXMath(pDestPos))
1957 ExtrudeVertices_DirectXMath_DirectionalLight<false, true>::apply(
1958 lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
1959 else
1960 ExtrudeVertices_DirectXMath_DirectionalLight<false, false>::apply(
1961 lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
1962 }
1963 }
1964 else
1965 {
1966 assert(lightPos.w == 1.0f);
1967
1968 if (_isAlignedForDirectXMath(pSrcPos))
1969 {
1970 if (_isAlignedForDirectXMath(pDestPos))
1971 ExtrudeVertices_DirectXMath_PointLight<true, true>::apply(
1972 lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
1973 else
1974 ExtrudeVertices_DirectXMath_PointLight<true, false>::apply(
1975 lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
1976 }
1977 else
1978 {
1979 if (_isAlignedForDirectXMath(pDestPos))
1980 ExtrudeVertices_DirectXMath_PointLight<false, true>::apply(
1981 lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
1982 else
1983 ExtrudeVertices_DirectXMath_PointLight<false, false>::apply(
1984 lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
1985 }
1986 }
1987 }
1988 //---------------------------------------------------------------------
1989 //---------------------------------------------------------------------
1990 //---------------------------------------------------------------------
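    // Returns the shared DirectXMath-optimised implementation; the
    // function-local static yields a single lazily constructed instance.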
1991     extern OptimisedUtil* _getOptimisedUtilDirectXMath(void)
1992 {
1993 static OptimisedUtilDirectXMath msOptimisedUtilDirectXMath;
1994 return &msOptimisedUtilDirectXMath;
1995 }
1996
1997 }
1998
1999 #endif // __OGRE_HAVE_DIRECTXMATH
2000
2001