1 /*
2 -----------------------------------------------------------------------------
3 This source file is part of OGRE
4     (Object-oriented Graphics Rendering Engine)
5 For the latest info, see http://www.ogre3d.org/
6 
7 Copyright (c) 2000-2013 Torus Knot Software Ltd
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.
26 -----------------------------------------------------------------------------
27 */
28 #include "OgreStableHeaders.h"
29 
30 #include "OgreOptimisedUtil.h"
31 #include "OgrePlatformInformation.h"
32 
33 #if __OGRE_HAVE_SSE
34 
35 #include "OgreMatrix4.h"
36 
37 // Keep this include last, to avoid conflicts with "xmmintrin.h" potentially being
38 // included by other header files on some platforms.
39 #include "OgreSIMDHelper.h"
40 
41 // I'd like to merge this file with OgreOptimisedUtil.cpp, but that's
42 // impossible when compiling with gcc, because SSE instructions can only be
43 // enabled/disabled at the file level.
44 
45 //-------------------------------------------------------------------------
46 //
47 // The routines implemented in this file are performance oriented, which
48 // means squeezing out every cycle we can. This requirement might break
49 // some C++/STL rules.
50 //
51 //
52 // Some rules I'd like to respect:
53 //
54 // 1. Prefer unpacklo/hi and movelh/hl over shuffle, because they save one
55 //    byte of binary code per instruction :)  (See the sketch below.)
56 // 2. Prefer add/sub over mul.
57 // 3. Eliminate function-call prologue code.
58 //
59 // Finally, follow anything recommended by the Intel Optimization Reference Manual.
60 //
61 //-------------------------------------------------------------------------
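//
// Illustrative sketch of rule 1 (not part of the build): the unpack/move forms
// below produce the same lane arrangement as the equivalent _mm_shuffle_ps
// calls, but with a shorter encoding (no immediate byte). The function name is
// hypothetical and exists only for illustration.
#if 0
static inline void exampleMoveVsShuffle(__m128 a, __m128 b,
                                        __m128& lo, __m128& hi)
{
    // (a0 a1 b0 b1) == _mm_shuffle_ps(a, b, _MM_SHUFFLE(1,0,1,0))
    lo = _mm_movelh_ps(a, b);
    // (a2 b2 a3 b3) -- interleave the upper halves of a and b
    hi = _mm_unpackhi_ps(a, b);
}
#endif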
62 
63 // Use the unrolled SSE version when the vertex count exceeds this limit
64 #define OGRE_SSE_SKINNING_UNROLL_VERTICES  16
65 
66 namespace Ogre {
67 
68 //-------------------------------------------------------------------------
69 // Local classes
70 //-------------------------------------------------------------------------
71 
72     /** SSE implementation of OptimisedUtil.
73     @note
74         Don't use this class directly, use OptimisedUtil instead.
75     */
76     class _OgrePrivate OptimisedUtilSSE : public OptimisedUtil
77     {
78     protected:
79         /// Do we prefer to use a general SSE version for position/normal shared buffers?
80         bool mPreferGeneralVersionForSharedBuffers;
81 
82     public:
83         /// Constructor
84         OptimisedUtilSSE(void);
85 
86         /// @copydoc OptimisedUtil::softwareVertexSkinning
87         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexSkinning(
88             const float *srcPosPtr, float *destPosPtr,
89             const float *srcNormPtr, float *destNormPtr,
90             const float *blendWeightPtr, const unsigned char* blendIndexPtr,
91             const Matrix4* const* blendMatrices,
92             size_t srcPosStride, size_t destPosStride,
93             size_t srcNormStride, size_t destNormStride,
94             size_t blendWeightStride, size_t blendIndexStride,
95             size_t numWeightsPerVertex,
96             size_t numVertices);
97 
98         /// @copydoc OptimisedUtil::softwareVertexMorph
99         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexMorph(
100             Real t,
101             const float *srcPos1, const float *srcPos2,
102             float *dstPos,
103 			size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
104             size_t numVertices,
105 			bool morphNormals);
106 
107         /// @copydoc OptimisedUtil::concatenateAffineMatrices
108         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE concatenateAffineMatrices(
109             const Matrix4& baseMatrix,
110             const Matrix4* srcMatrices,
111             Matrix4* dstMatrices,
112             size_t numMatrices);
113 
114         /// @copydoc OptimisedUtil::calculateFaceNormals
115         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateFaceNormals(
116             const float *positions,
117             const EdgeData::Triangle *triangles,
118             Vector4 *faceNormals,
119             size_t numTriangles);
120 
121         /// @copydoc OptimisedUtil::calculateLightFacing
122         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateLightFacing(
123             const Vector4& lightPos,
124             const Vector4* faceNormals,
125             char* lightFacings,
126             size_t numFaces);
127 
128         /// @copydoc OptimisedUtil::extrudeVertices
129         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE extrudeVertices(
130             const Vector4& lightPos,
131             Real extrudeDist,
132             const float* srcPositions,
133             float* destPositions,
134             size_t numVertices);
135     };
136 
137 #if defined(__OGRE_SIMD_ALIGN_STACK)
138     /** Stack-align implementation of OptimisedUtil.
139     @remarks
140         User code compiled by icc and gcc might not align the stack
141         properly, so we need to ensure the stack is aligned to a 16-byte
142         boundary when executing SSE functions.
143     @par
144         We implement this by aligning the stack right after a virtual
145         function call, which should guarantee that a call instruction is used
146         instead of inlining the underlying function body here (which might cause problems).
147     @note
148         Don't use this class directly, use OptimisedUtil instead.
149     */
150     class _OgrePrivate OptimisedUtilWithStackAlign : public OptimisedUtil
151     {
152     protected:
153         /// The actual implementation
154         OptimisedUtil* mImpl;
155 
156     public:
157         /// Constructor
158         OptimisedUtilWithStackAlign(OptimisedUtil* impl)
159             : mImpl(impl)
160         {
161         }
162 
163         /// @copydoc OptimisedUtil::softwareVertexSkinning
164         virtual void softwareVertexSkinning(
165             const float *srcPosPtr, float *destPosPtr,
166             const float *srcNormPtr, float *destNormPtr,
167             const float *blendWeightPtr, const unsigned char* blendIndexPtr,
168             const Matrix4* const* blendMatrices,
169             size_t srcPosStride, size_t destPosStride,
170             size_t srcNormStride, size_t destNormStride,
171             size_t blendWeightStride, size_t blendIndexStride,
172             size_t numWeightsPerVertex,
173             size_t numVertices)
174         {
175             __OGRE_SIMD_ALIGN_STACK();
176 
177             mImpl->softwareVertexSkinning(
178                 srcPosPtr, destPosPtr,
179                 srcNormPtr, destNormPtr,
180                 blendWeightPtr, blendIndexPtr,
181                 blendMatrices,
182                 srcPosStride, destPosStride,
183                 srcNormStride, destNormStride,
184                 blendWeightStride, blendIndexStride,
185                 numWeightsPerVertex,
186                 numVertices);
187         }
188 
189         /// @copydoc OptimisedUtil::softwareVertexMorph
190         virtual void softwareVertexMorph(
191             Real t,
192             const float *srcPos1, const float *srcPos2,
193             float *dstPos,
194 			size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
195             size_t numVertices,
196 			bool morphNormals)
197         {
198             __OGRE_SIMD_ALIGN_STACK();
199 
200             mImpl->softwareVertexMorph(
201                 t,
202                 srcPos1, srcPos2,
203                 dstPos,
204 				pos1VSize, pos2VSize, dstVSize,
205                 numVertices,
206 				morphNormals);
207         }
208 
209         /// @copydoc OptimisedUtil::concatenateAffineMatrices
210         virtual void concatenateAffineMatrices(
211             const Matrix4& baseMatrix,
212             const Matrix4* srcMatrices,
213             Matrix4* dstMatrices,
214             size_t numMatrices)
215         {
216             __OGRE_SIMD_ALIGN_STACK();
217 
218             mImpl->concatenateAffineMatrices(
219                 baseMatrix,
220                 srcMatrices,
221                 dstMatrices,
222                 numMatrices);
223         }
224 
225         /// @copydoc OptimisedUtil::calculateFaceNormals
226         virtual void calculateFaceNormals(
227             const float *positions,
228             const EdgeData::Triangle *triangles,
229             Vector4 *faceNormals,
230             size_t numTriangles)
231         {
232             __OGRE_SIMD_ALIGN_STACK();
233 
234             mImpl->calculateFaceNormals(
235                 positions,
236                 triangles,
237                 faceNormals,
238                 numTriangles);
239         }
240 
241         /// @copydoc OptimisedUtil::calculateLightFacing
242         virtual void calculateLightFacing(
243             const Vector4& lightPos,
244             const Vector4* faceNormals,
245             char* lightFacings,
246             size_t numFaces)
247         {
248             __OGRE_SIMD_ALIGN_STACK();
249 
250             mImpl->calculateLightFacing(
251                 lightPos,
252                 faceNormals,
253                 lightFacings,
254                 numFaces);
255         }
256 
257         /// @copydoc OptimisedUtil::extrudeVertices
258         virtual void extrudeVertices(
259             const Vector4& lightPos,
260             Real extrudeDist,
261             const float* srcPositions,
262             float* destPositions,
263             size_t numVertices)
264         {
265             __OGRE_SIMD_ALIGN_STACK();
266 
267             mImpl->extrudeVertices(
268                 lightPos,
269                 extrudeDist,
270                 srcPositions,
271                 destPositions,
272                 numVertices);
273         }
274     };
275 #endif  // defined(__OGRE_SIMD_ALIGN_STACK)
276 
277 //---------------------------------------------------------------------
278 // Some useful macros for collapsing matrices.
279 //---------------------------------------------------------------------
280 
281 #define __LOAD_MATRIX(row0, row1, row2, pMatrix)                        \
282     {                                                                   \
283         row0 = __MM_LOAD_PS((*pMatrix)[0]);                             \
284         row1 = __MM_LOAD_PS((*pMatrix)[1]);                             \
285         row2 = __MM_LOAD_PS((*pMatrix)[2]);                             \
286     }
287 
288 #define __LERP_MATRIX(row0, row1, row2, weight, pMatrix)                \
289     {                                                                   \
290         row0 = __MM_LERP_PS(weight, row0, __MM_LOAD_PS((*pMatrix)[0])); \
291         row1 = __MM_LERP_PS(weight, row1, __MM_LOAD_PS((*pMatrix)[1])); \
292         row2 = __MM_LERP_PS(weight, row2, __MM_LOAD_PS((*pMatrix)[2])); \
293     }
294 
295 #define __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix)       \
296     {                                                                   \
297         row0 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[0]), weight);         \
298         row1 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[1]), weight);         \
299         row2 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[2]), weight);         \
300     }
301 
302 #define __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix)      \
303     {                                                                   \
304         row0 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[0]), weight, row0); \
305         row1 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[1]), weight, row1); \
306         row2 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[2]), weight, row2); \
307     }
308 
309 //---------------------------------------------------------------------
310 // The following macros require variables to be declared by the caller.
311 //
312 // :) Thanks to the row-major matrices used in Ogre, accessing affine matrices is easy.
313 //---------------------------------------------------------------------
314 
315 /** Collapse a one-weight matrix.
316     The multiply by the weight is eliminated, since a single weight is always equal to one.
317 */
318 #define __COLLAPSE_MATRIX_W1(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
319     {                                                                           \
320         pMatrix0 = blendMatrices[pIndices[0]];                                  \
321         __LOAD_MATRIX(row0, row1, row2, pMatrix0);                              \
322     }
323 
324 /** Collapse a two-weight matrix.
325     Because the two weights sum to one, a lerp can be used, replacing two multiplies
326     and one add with one multiply and two adds (a scalar sketch follows this macro).
327 */
328 #define __COLLAPSE_MATRIX_W2(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
329     {                                                                           \
330         weight = _mm_load_ps1(pWeights + 1);                                    \
331         pMatrix0 = ppMatrices[pIndices[0]];                                     \
332         __LOAD_MATRIX(row0, row1, row2, pMatrix0);                              \
333         pMatrix1 = ppMatrices[pIndices[1]];                                     \
334         __LERP_MATRIX(row0, row1, row2, weight, pMatrix1);                      \
335     }
336 
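// Illustrative scalar sketch of the lerp trick used by __COLLAPSE_MATRIX_W2
// (not part of the build): when the two weights sum to one,
// w0*m0 + w1*m1 == m0 + w1*(m1 - m0), so one multiply and two adds/subs
// replace two multiplies and one add. The function name is hypothetical.
#if 0
static inline float collapseTwoWeightsScalar(float m0, float m1, float w1)
{
    // Assumes w0 + w1 == 1, as guaranteed for blend weights, so
    // w0*m0 + w1*m1 collapses to the lerp form below.
    return m0 + w1 * (m1 - m0);
}
#endif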
337 /** Collapse a three-weight matrix.
338 */
339 #define __COLLAPSE_MATRIX_W3(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
340     {                                                                           \
341         weight = _mm_load_ps1(pWeights + 0);                                    \
342         pMatrix0 = ppMatrices[pIndices[0]];                                     \
343         __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0);             \
344         weight = _mm_load_ps1(pWeights + 1);                                    \
345         pMatrix1 = ppMatrices[pIndices[1]];                                     \
346         __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1);            \
347         weight = _mm_load_ps1(pWeights + 2);                                    \
348         pMatrix2 = ppMatrices[pIndices[2]];                                     \
349         __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2);            \
350     }
351 
352 /** Collapse a four-weight matrix.
353 */
354 #define __COLLAPSE_MATRIX_W4(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
355     {                                                                           \
356         /* Load four blend weights at one time, they will be shuffled later */  \
357         weights = _mm_loadu_ps(pWeights);                                       \
358                                                                                 \
359         pMatrix0 = ppMatrices[pIndices[0]];                                     \
360         weight = __MM_SELECT(weights, 0);                                       \
361         __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0);             \
362         pMatrix1 = ppMatrices[pIndices[1]];                                     \
363         weight = __MM_SELECT(weights, 1);                                       \
364         __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1);            \
365         pMatrix2 = ppMatrices[pIndices[2]];                                     \
366         weight = __MM_SELECT(weights, 2);                                       \
367         __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2);            \
368         pMatrix3 = ppMatrices[pIndices[3]];                                     \
369         weight = __MM_SELECT(weights, 3);                                       \
370         __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix3);            \
371     }
372 
373 
374 
375     //---------------------------------------------------------------------
376     // Collapse one matrix at a time. The collapsed matrix is weighted by the
377     // blend weights and can then be used to transform the corresponding vertex
378     // directly (a scalar reference sketch follows this macro).
379     //
380     // I'd like to use an inline function instead of a macro here, but I also want
381     // to ensure the compiler integrates this code into its callers (in release
382     // builds at least) regardless of compile options. An inline function works fine
383     // for VC, but gcc (3.4.4 here) appears to generate a function call even with "-O3".
384     //
385 #define _collapseOneMatrix(                                                     \
386         m00, m01, m02,                                                          \
387         pBlendWeight, pBlendIndex,                                              \
388         blendMatrices,                                                          \
389         blendWeightStride, blendIndexStride,                                    \
390         numWeightsPerVertex)                                                    \
391     {                                                                           \
392         /* Important note: if the pMatrixXXX variables are reused frequently, */ \
393         /* MSVC 7.1 generates incorrect code here!                            */ \
394         const Matrix4* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3;               \
395         __m128 weight, weights;                                                 \
396                                                                                 \
397         switch (numWeightsPerVertex)                                            \
398         {                                                                       \
399         default:    /* Just in case and make compiler happy */                  \
400         case 1:                                                                 \
401             __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices,                  \
402                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
403                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
404             break;                                                              \
405                                                                                 \
406         case 2:                                                                 \
407             __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices,                  \
408                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
409                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
410             break;                                                              \
411                                                                                 \
412         case 3:                                                                 \
413             __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices,                  \
414                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
415                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
416             break;                                                              \
417                                                                                 \
418         case 4:                                                                 \
419             __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices,                  \
420                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
421                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
422             break;                                                              \
423         }                                                                       \
424     }
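
    //---------------------------------------------------------------------
    // Illustrative scalar reference (not part of the build) of what
    // _collapseOneMatrix computes: a blend-weighted sum of the upper 3x4 part
    // of the referenced matrices, which can then be used to transform the
    // vertex directly. The function name is hypothetical.
    //---------------------------------------------------------------------
#if 0
    static void collapseOneMatrixScalar(float out[3][4],
                                        const float* weights,
                                        const unsigned char* indices,
                                        const Matrix4* const* blendMatrices,
                                        size_t numWeightsPerVertex)
    {
        for (size_t r = 0; r < 3; ++r)
            for (size_t c = 0; c < 4; ++c)
            {
                out[r][c] = 0.0f;
                for (size_t w = 0; w < numWeightsPerVertex; ++w)
                    out[r][c] += weights[w] * (*blendMatrices[indices[w]])[r][c];
            }
    }
#endif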
425 
426     //---------------------------------------------------------------------
427     // Collapse four matrices at a time. The collapsed matrices are weighted by
428     // the blend weights and can then be used to transform the corresponding
429     // vertices directly.
430     //
431     // I'd like to use an inline function instead of a macro here, but I also want
432     // to ensure the compiler integrates this code into its callers (in release
433     // builds at least) regardless of compile options. An inline function works fine
434     // for VC, but gcc (3.4.4 here) appears to generate a function call even with "-O3".
435     //
436 #define _collapseFourMatrices(                                                  \
437         m00, m01, m02,                                                          \
438         m10, m11, m12,                                                          \
439         m20, m21, m22,                                                          \
440         m30, m31, m32,                                                          \
441         pBlendWeight, pBlendIndex,                                              \
442         blendMatrices,                                                          \
443         blendWeightStride, blendIndexStride,                                    \
444         numWeightsPerVertex)                                                    \
445     {                                                                           \
446         /* Important note: if the pMatrixXXX variables are reused frequently, */ \
447         /* MSVC 7.1 generates incorrect code here!                            */ \
448         const Matrix4* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3;               \
449         __m128 weight, weights;                                                 \
450                                                                                 \
451         switch (numWeightsPerVertex)                                            \
452         {                                                                       \
453         default:    /* Just in case and make compiler happy */                  \
454         case 1:                                                                 \
455             __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices,                  \
456                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
457                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
458             __COLLAPSE_MATRIX_W1(m10, m11, m12, blendMatrices,                  \
459                 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
460                 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
461             __COLLAPSE_MATRIX_W1(m20, m21, m22, blendMatrices,                  \
462                 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
463                 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
464             __COLLAPSE_MATRIX_W1(m30, m31, m32, blendMatrices,                  \
465                 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
466                 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
467             break;                                                              \
468                                                                                 \
469         case 2:                                                                 \
470             __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices,                  \
471                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
472                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
473             __COLLAPSE_MATRIX_W2(m10, m11, m12, blendMatrices,                  \
474                 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
475                 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
476             __COLLAPSE_MATRIX_W2(m20, m21, m22, blendMatrices,                  \
477                 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
478                 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
479             __COLLAPSE_MATRIX_W2(m30, m31, m32, blendMatrices,                  \
480                 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
481                 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
482             break;                                                              \
483                                                                                 \
484         case 3:                                                                 \
485             __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices,                  \
486                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
487                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
488             __COLLAPSE_MATRIX_W3(m10, m11, m12, blendMatrices,                  \
489                 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
490                 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
491             __COLLAPSE_MATRIX_W3(m20, m21, m22, blendMatrices,                  \
492                 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
493                 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
494             __COLLAPSE_MATRIX_W3(m30, m31, m32, blendMatrices,                  \
495                 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
496                 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
497             break;                                                              \
498                                                                                 \
499         case 4:                                                                 \
500             __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices,                  \
501                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
502                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
503             __COLLAPSE_MATRIX_W4(m10, m11, m12, blendMatrices,                  \
504                 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
505                 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
506             __COLLAPSE_MATRIX_W4(m20, m21, m22, blendMatrices,                  \
507                 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
508                 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
509             __COLLAPSE_MATRIX_W4(m30, m31, m32, blendMatrices,                  \
510                 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
511                 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
512             break;                                                              \
513         }                                                                       \
514     }
515 
516 
517     //---------------------------------------------------------------------
518     // General SSE version: skins positions, and optionally skins normals.
519     static void softwareVertexSkinning_SSE_General(
520         const float *pSrcPos, float *pDestPos,
521         const float *pSrcNorm, float *pDestNorm,
522         const float *pBlendWeight, const unsigned char* pBlendIndex,
523         const Matrix4* const* blendMatrices,
524         size_t srcPosStride, size_t destPosStride,
525         size_t srcNormStride, size_t destNormStride,
526         size_t blendWeightStride, size_t blendIndexStride,
527         size_t numWeightsPerVertex,
528         size_t numVertices)
529     {
530         for (size_t i = 0; i < numVertices; ++i)
531         {
532             // Collapse matrices
533             __m128 m00, m01, m02;
534             _collapseOneMatrix(
535                 m00, m01, m02,
536                 pBlendWeight, pBlendIndex,
537                 blendMatrices,
538                 blendWeightStride, blendIndexStride,
539                 numWeightsPerVertex);
540 
541             // Advance blend weight and index pointers
542             advanceRawPointer(pBlendWeight, blendWeightStride);
543             advanceRawPointer(pBlendIndex, blendIndexStride);
544 
545             //------------------------------------------------------------------
546 
547             // Rearrange to a column-major matrix with the rows shuffled to the order: Z 0 X Y
548             __m128 m03 = _mm_setzero_ps();
549             __MM_TRANSPOSE4x4_PS(m02, m03, m00, m01);
550 
551             //------------------------------------------------------------------
552             // Transform position
553             //------------------------------------------------------------------
554 
555             __m128 s0, s1, s2;
556 
557             // Load source position
558             s0 = _mm_load_ps1(pSrcPos + 0);
559             s1 = _mm_load_ps1(pSrcPos + 1);
560             s2 = _mm_load_ps1(pSrcPos + 2);
561 
562             // Transform by collapsed matrix
563             __m128 accumPos = __MM_DOT4x3_PS(m02, m03, m00, m01, s0, s1, s2);   // z 0 x y
564 
565             // Store the blended position; no alignment requirement (see the sketch after this function)
566             _mm_storeh_pi((__m64*)pDestPos, accumPos);
567             _mm_store_ss(pDestPos+2, accumPos);
568 
569             // Advance source and target position pointers
570             advanceRawPointer(pSrcPos, srcPosStride);
571             advanceRawPointer(pDestPos, destPosStride);
572 
573             //------------------------------------------------------------------
574             // Optional blend normal
575             //------------------------------------------------------------------
576 
577             if (pSrcNorm)
578             {
579                 // Load source normal
580                 s0 = _mm_load_ps1(pSrcNorm + 0);
581                 s1 = _mm_load_ps1(pSrcNorm + 1);
582                 s2 = _mm_load_ps1(pSrcNorm + 2);
583 
584                 // Transform by collapsed matrix
585                 __m128 accumNorm = __MM_DOT3x3_PS(m02, m03, m00, s0, s1, s2);   // z 0 x y
586 
587                 // Normalise normal
588                 __m128 tmp = _mm_mul_ps(accumNorm, accumNorm);                  // z^2 0 x^2 y^2
589                 tmp = __MM_ACCUM3_PS(tmp,
590                         _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,1,2)),         // x^2 0 y^2 z^2
591                         _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,0,1,3)));        // y^2 0 z^2 x^2
592                 // Note: a zero-length normal divides by zero here, but the effect is negligible
593                 tmp = __MM_RSQRT_PS(tmp);
594                 accumNorm = _mm_mul_ps(accumNorm, tmp);
595 
596                 // Store the blended normal; no alignment requirement
597                 _mm_storeh_pi((__m64*)pDestNorm, accumNorm);
598                 _mm_store_ss(pDestNorm+2, accumNorm);
599 
600                 // Advance source and target normal pointers
601                 advanceRawPointer(pSrcNorm, srcNormStride);
602                 advanceRawPointer(pDestNorm, destNormStride);
603             }
604         }
605     }
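    //---------------------------------------------------------------------
    // Illustrative sketch (not part of the build) of the store trick used
    // above: with the result arranged as (z, 0, x, y) in a register,
    // _mm_storeh_pi writes the upper two lanes (x, y) and _mm_store_ss writes
    // the lowest lane (z), so exactly three floats are written with no
    // alignment requirement. The function name is hypothetical.
#if 0
    static inline void storeXYZFromZ0XY(float* dst, __m128 z0xy)
    {
        _mm_storeh_pi((__m64*)dst, z0xy);   // lanes 2,3 -> dst[0] = x, dst[1] = y
        _mm_store_ss(dst + 2, z0xy);        // lane 0    -> dst[2] = z
    }
#endif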
606     //---------------------------------------------------------------------
607     // Special SSE version for skinning a shared position/normal buffer,
608     // where the buffer is packed.
609     template <bool srcAligned, bool destAligned>
610     struct SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed
611     {
612         static void apply(
613             const float* pSrc, float* pDest,
614             const float* pBlendWeight, const unsigned char* pBlendIndex,
615             const Matrix4* const* blendMatrices,
616             size_t blendWeightStride, size_t blendIndexStride,
617             size_t numWeightsPerVertex,
618             size_t numIterations)
619         {
620             typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
621             typedef SSEMemoryAccessor<destAligned> DestAccessor;
622 
623             // Blending 4 vertices per-iteration
624             for (size_t i = 0; i < numIterations; ++i)
625             {
626                 // Collapse matrices
627                 __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
628                 _collapseFourMatrices(
629                     m00, m01, m02,
630                     m10, m11, m12,
631                     m20, m21, m22,
632                     m30, m31, m32,
633                     pBlendWeight, pBlendIndex,
634                     blendMatrices,
635                     blendWeightStride, blendIndexStride,
636                     numWeightsPerVertex);
637 
638                 // Advance 4 vertices
639                 advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
640                 advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
641 
642                 //------------------------------------------------------------------
643                 // Transform position/normals
644                 //------------------------------------------------------------------
645 
646                 __m128 s0, s1, s2, s3, s4, s5, d0, d1, d2, d3, d4, d5;
647                 __m128 t0, t1, t2, t3, t4, t5;
648 
649                 // Load source position/normals
650                 s0 = SrcAccessor::load(pSrc + 0);                       // px0 py0 pz0 nx0
651                 s1 = SrcAccessor::load(pSrc + 4);                       // ny0 nz0 px1 py1
652                 s2 = SrcAccessor::load(pSrc + 8);                       // pz1 nx1 ny1 nz1
653                 s3 = SrcAccessor::load(pSrc + 12);                      // px2 py2 pz2 nx2
654                 s4 = SrcAccessor::load(pSrc + 16);                      // ny2 nz2 px3 py3
655                 s5 = SrcAccessor::load(pSrc + 20);                      // pz3 nx3 ny3 nz3
656 
657                 // Rearrange to component-major order for batch calculation.
658 
659                 t0 = _mm_unpacklo_ps(s0, s3);                           // px0 px2 py0 py2
660                 t1 = _mm_unpackhi_ps(s0, s3);                           // pz0 pz2 nx0 nx2
661                 t2 = _mm_unpacklo_ps(s1, s4);                           // ny0 ny2 nz0 nz2
662                 t3 = _mm_unpackhi_ps(s1, s4);                           // px1 px3 py1 py3
663                 t4 = _mm_unpacklo_ps(s2, s5);                           // pz1 pz3 nx1 nx3
664                 t5 = _mm_unpackhi_ps(s2, s5);                           // ny1 ny3 nz1 nz3
665 
666                 s0 = _mm_unpacklo_ps(t0, t3);                           // px0 px1 px2 px3
667                 s1 = _mm_unpackhi_ps(t0, t3);                           // py0 py1 py2 py3
668                 s2 = _mm_unpacklo_ps(t1, t4);                           // pz0 pz1 pz2 pz3
669                 s3 = _mm_unpackhi_ps(t1, t4);                           // nx0 nx1 nx2 nx3
670                 s4 = _mm_unpacklo_ps(t2, t5);                           // ny0 ny1 ny2 ny3
671                 s5 = _mm_unpackhi_ps(t2, t5);                           // nz0 nz1 nz2 nz3
672 
673                 // Transform by collapsed matrix
674 
675                 // Shuffle row 0 of the four collapsed matrices to calculate the X components
676                 __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
677 
678                 // Transform X components
679                 d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // PX0 PX1 PX2 PX3
680                 d3 = __MM_DOT3x3_PS(m00, m10, m20, s3, s4, s5);         // NX0 NX1 NX2 NX3
681 
682                 // Shuffle row 1 of the four collapsed matrices to calculate the Y components
683                 __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
684 
685                 // Transform Y components
686                 d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // PY0 PY1 PY2 PY3
687                 d4 = __MM_DOT3x3_PS(m01, m11, m21, s3, s4, s5);         // NY0 NY1 NY2 NY3
688 
689                 // Shuffle row 2 of the four collapsed matrices to calculate the Z components
690                 __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
691 
692                 // Transform Z components
693                 d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // PZ0 PZ1 PZ2 PZ3
694                 d5 = __MM_DOT3x3_PS(m02, m12, m22, s3, s4, s5);         // NZ0 NZ1 NZ2 NZ3
695 
696                 // Normalise normals
697                 __m128 tmp = __MM_DOT3x3_PS(d3, d4, d5, d3, d4, d5);
698                 tmp = __MM_RSQRT_PS(tmp);
699                 d3 = _mm_mul_ps(d3, tmp);
700                 d4 = _mm_mul_ps(d4, tmp);
701                 d5 = _mm_mul_ps(d5, tmp);
702 
703                 // Rearrange back to the interleaved format to store the results
704 
705                 t0 = _mm_unpacklo_ps(d0, d1);                           // PX0 PY0 PX1 PY1
706                 t1 = _mm_unpackhi_ps(d0, d1);                           // PX2 PY2 PX3 PY3
707                 t2 = _mm_unpacklo_ps(d2, d3);                           // PZ0 NX0 PZ1 NX1
708                 t3 = _mm_unpackhi_ps(d2, d3);                           // PZ2 NX2 PZ3 NX3
709                 t4 = _mm_unpacklo_ps(d4, d5);                           // NY0 NZ0 NY1 NZ1
710                 t5 = _mm_unpackhi_ps(d4, d5);                           // NY2 NZ2 NY3 NZ3
711 
712                 d0 = _mm_movelh_ps(t0, t2);                             // PX0 PY0 PZ0 NX0
713                 d1 = _mm_shuffle_ps(t4, t0, _MM_SHUFFLE(3,2,1,0));      // NY0 NZ0 PX1 PY1
714                 d2 = _mm_movehl_ps(t4, t2);                             // PZ1 NX1 NY1 NZ1
715                 d3 = _mm_movelh_ps(t1, t3);                             // PX2 PY2 PZ2 NX2
716                 d4 = _mm_shuffle_ps(t5, t1, _MM_SHUFFLE(3,2,1,0));      // NY2 NZ2 PX3 PY3
717                 d5 = _mm_movehl_ps(t5, t3);                             // PZ3 NX3 NY3 NZ3
718 
719                 // Store blended position/normals
720                 DestAccessor::store(pDest + 0, d0);
721                 DestAccessor::store(pDest + 4, d1);
722                 DestAccessor::store(pDest + 8, d2);
723                 DestAccessor::store(pDest + 12, d3);
724                 DestAccessor::store(pDest + 16, d4);
725                 DestAccessor::store(pDest + 20, d5);
726 
727                 // Advance 4 vertices
728                 pSrc += 4 * (3 + 3);
729                 pDest += 4 * (3 + 3);
730             }
731         }
732     };
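    //---------------------------------------------------------------------
    // Illustrative layout note (not part of the build): in the shared packed
    // case each vertex is six contiguous floats, so four vertices span exactly
    // six __m128 loads, and each successive vertex starts 24 bytes later --
    // hence the 8-byte alignment shift per vertex mentioned below. The struct
    // name is hypothetical.
#if 0
    struct ExamplePackedVertex
    {
        float px, py, pz;   // position
        float nx, ny, nz;   // normal
    };
    // sizeof(ExamplePackedVertex) == 24; 4 vertices == 96 bytes == 6 x __m128
#endif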
733     static FORCEINLINE void softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
734             const float* pSrcPos, float* pDestPos,
735             const float* pBlendWeight, const unsigned char* pBlendIndex,
736             const Matrix4* const* blendMatrices,
737             size_t blendWeightStride, size_t blendIndexStride,
738             size_t numWeightsPerVertex,
739             size_t numIterations)
740     {
741         // pSrcPos might not be 16-byte aligned, because of the 8-byte alignment shift per vertex
742 
743         // Only two versions are instantiated, since other alignment combinations are not that important (see the sketch after this function).
744         if (_isAlignedForSSE(pSrcPos) && _isAlignedForSSE(pDestPos))
745         {
746             SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<true, true>::apply(
747                 pSrcPos, pDestPos,
748                 pBlendWeight, pBlendIndex,
749                 blendMatrices,
750                 blendWeightStride, blendIndexStride,
751                 numWeightsPerVertex,
752                 numIterations);
753         }
754         else
755         {
756             SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<false, false>::apply(
757                 pSrcPos, pDestPos,
758                 pBlendWeight, pBlendIndex,
759                 blendMatrices,
760                 blendWeightStride, blendIndexStride,
761                 numWeightsPerVertex,
762                 numIterations);
763         }
764     }
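    //---------------------------------------------------------------------
    // Minimal sketch (not part of the build) of the compile-time alignment
    // dispatch idea used above. The real SSEMemoryAccessor lives in
    // OgreSIMDHelper.h and may differ in detail; the type below is a
    // hypothetical stand-in. The bool template parameter selects aligned or
    // unaligned loads/stores at compile time, so the inner loops contain no
    // per-access branches.
#if 0
    template <bool aligned> struct ExampleSSEAccessor;
    template <> struct ExampleSSEAccessor<true>
    {
        static __m128 load(const float* p)    { return _mm_load_ps(p); }
        static void store(float* p, __m128 v) { _mm_store_ps(p, v); }
    };
    template <> struct ExampleSSEAccessor<false>
    {
        static __m128 load(const float* p)    { return _mm_loadu_ps(p); }
        static void store(float* p, __m128 v) { _mm_storeu_ps(p, v); }
    };
#endif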
765     //---------------------------------------------------------------------
766     // Special SSE version for skinning separate position and normal buffers,
767     // where both buffers are packed.
768     template <bool srcPosAligned, bool destPosAligned, bool srcNormAligned, bool destNormAligned>
769     struct SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed
770     {
771         static void apply(
772             const float* pSrcPos, float* pDestPos,
773             const float* pSrcNorm, float* pDestNorm,
774             const float* pBlendWeight, const unsigned char* pBlendIndex,
775             const Matrix4* const* blendMatrices,
776             size_t blendWeightStride, size_t blendIndexStride,
777             size_t numWeightsPerVertex,
778             size_t numIterations)
779         {
780             typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
781             typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;
782             typedef SSEMemoryAccessor<srcNormAligned> SrcNormAccessor;
783             typedef SSEMemoryAccessor<destNormAligned> DestNormAccessor;
784 
785             // Blending 4 vertices per-iteration
786             for (size_t i = 0; i < numIterations; ++i)
787             {
788                 // Collapse matrices
789                 __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
790                 _collapseFourMatrices(
791                     m00, m01, m02,
792                     m10, m11, m12,
793                     m20, m21, m22,
794                     m30, m31, m32,
795                     pBlendWeight, pBlendIndex,
796                     blendMatrices,
797                     blendWeightStride, blendIndexStride,
798                     numWeightsPerVertex);
799 
800                 // Advance 4 vertices
801                 advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
802                 advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
803 
804                 //------------------------------------------------------------------
805                 // Transform positions
806                 //------------------------------------------------------------------
807 
808                 __m128 s0, s1, s2, d0, d1, d2;
809 
810                 // Load source positions
811                 s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
812                 s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
813                 s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3
814 
815                 // Rearrange to 3x4 component-major order for batch calculation (a scalar sketch of this rearrangement follows this struct)
816                 __MM_TRANSPOSE4x3_PS(s0, s1, s2);
817 
818                 // Transform by collapsed matrix
819 
820                 // Shuffle row 0 of the four collapsed matrices to calculate the X components
821                 __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
822 
823                 // Transform X components
824                 d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3
825 
826                 // Shuffle row 1 of the four collapsed matrices to calculate the Y components
827                 __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
828 
829                 // Transform Y components
830                 d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3
831 
832                 // Shuffle row 2 of the four collapsed matrices to calculate the Z components
833                 __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
834 
835                 // Transform Z components
836                 d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3
837 
838                 // Rearrange back to 4x3 interleaved format to store the results
839                 __MM_TRANSPOSE3x4_PS(d0, d1, d2);
840 
841                 // Store blended positions
842                 DestPosAccessor::store(pDestPos + 0, d0);
843                 DestPosAccessor::store(pDestPos + 4, d1);
844                 DestPosAccessor::store(pDestPos + 8, d2);
845 
846                 // Advance 4 vertices
847                 pSrcPos += 4 * 3;
848                 pDestPos += 4 * 3;
849 
850                 //------------------------------------------------------------------
851                 // Transform normals
852                 //------------------------------------------------------------------
853 
854                 // Load source normals
855                 s0 = SrcNormAccessor::load(pSrcNorm + 0);               // x0 y0 z0 x1
856                 s1 = SrcNormAccessor::load(pSrcNorm + 4);               // y1 z1 x2 y2
857                 s2 = SrcNormAccessor::load(pSrcNorm + 8);               // z2 x3 y3 z3
858 
859                 // Rearrange to 3x4 component-major order for batch calculation
860                 __MM_TRANSPOSE4x3_PS(s0, s1, s2);
861 
862                 // Transform by collapsed and shuffled matrices
863                 d0 = __MM_DOT3x3_PS(m00, m10, m20, s0, s1, s2);         // X0 X1 X2 X3
864                 d1 = __MM_DOT3x3_PS(m01, m11, m21, s0, s1, s2);         // Y0 Y1 Y2 Y3
865                 d2 = __MM_DOT3x3_PS(m02, m12, m22, s0, s1, s2);         // Z0 Z1 Z2 Z3
866 
867                 // Normalise normals
868                 __m128 tmp = __MM_DOT3x3_PS(d0, d1, d2, d0, d1, d2);
869                 tmp = __MM_RSQRT_PS(tmp);
870                 d0 = _mm_mul_ps(d0, tmp);
871                 d1 = _mm_mul_ps(d1, tmp);
872                 d2 = _mm_mul_ps(d2, tmp);
873 
874                 // Rearrange back to 4x3 interleaved format to store the results
875                 __MM_TRANSPOSE3x4_PS(d0, d1, d2);
876 
877                 // Store blended normals
878                 DestNormAccessor::store(pDestNorm + 0, d0);
879                 DestNormAccessor::store(pDestNorm + 4, d1);
880                 DestNormAccessor::store(pDestNorm + 8, d2);
881 
882                 // Advance 4 vertices
883                 pSrcNorm += 4 * 3;
884                 pDestNorm += 4 * 3;
885             }
886         }
887     };
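    //---------------------------------------------------------------------
    // Scalar model (not part of the build) of the 4x3 <-> 3x4 rearrangement
    // performed by __MM_TRANSPOSE4x3_PS / __MM_TRANSPOSE3x4_PS above: four
    // packed xyz vertices are regrouped into separate x, y and z vectors so
    // that each SSE operation processes the same component of four vertices
    // at once. The function name is hypothetical.
#if 0
    static void transpose4x3Scalar(const float in[12],
                                   float x[4], float y[4], float z[4])
    {
        for (int v = 0; v < 4; ++v)
        {
            x[v] = in[3 * v + 0];
            y[v] = in[3 * v + 1];
            z[v] = in[3 * v + 2];
        }
    }
#endif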
888     static FORCEINLINE void softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
889         const float* pSrcPos, float* pDestPos,
890         const float* pSrcNorm, float* pDestNorm,
891         const float* pBlendWeight, const unsigned char* pBlendIndex,
892         const Matrix4* const* blendMatrices,
893         size_t blendWeightStride, size_t blendIndexStride,
894         size_t numWeightsPerVertex,
895         size_t numIterations)
896     {
897         assert(_isAlignedForSSE(pSrcPos));
898 
899         // Only two versions are instantiated, since other alignment combinations are not that important.
900         if (_isAlignedForSSE(pSrcNorm) && _isAlignedForSSE(pDestPos) && _isAlignedForSSE(pDestNorm))
901         {
902             SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, true, true, true>::apply(
903                 pSrcPos, pDestPos,
904                 pSrcNorm, pDestNorm,
905                 pBlendWeight, pBlendIndex,
906                 blendMatrices,
907                 blendWeightStride, blendIndexStride,
908                 numWeightsPerVertex,
909                 numIterations);
910         }
911         else
912         {
913             SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, false, false, false>::apply(
914                 pSrcPos, pDestPos,
915                 pSrcNorm, pDestNorm,
916                 pBlendWeight, pBlendIndex,
917                 blendMatrices,
918                 blendWeightStride, blendIndexStride,
919                 numWeightsPerVertex,
920                 numIterations);
921         }
922     }
923     //---------------------------------------------------------------------
924     // Special SSE version for skinning positions only, where the position
925     // buffer is packed.
926     template <bool srcPosAligned, bool destPosAligned>
927     struct SoftwareVertexSkinning_SSE_PosOnly_Packed
928     {
929         static void apply(
930             const float* pSrcPos, float* pDestPos,
931             const float* pBlendWeight, const unsigned char* pBlendIndex,
932             const Matrix4* const* blendMatrices,
933             size_t blendWeightStride, size_t blendIndexStride,
934             size_t numWeightsPerVertex,
935             size_t numIterations)
936         {
937             typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
938             typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;
939 
940             // Blending 4 vertices per-iteration
941             for (size_t i = 0; i < numIterations; ++i)
942             {
943                 // Collapse matrices
944                 __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
945                 _collapseFourMatrices(
946                     m00, m01, m02,
947                     m10, m11, m12,
948                     m20, m21, m22,
949                     m30, m31, m32,
950                     pBlendWeight, pBlendIndex,
951                     blendMatrices,
952                     blendWeightStride, blendIndexStride,
953                     numWeightsPerVertex);
954 
955                 // Advance 4 vertices
956                 advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
957                 advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
958 
959                 //------------------------------------------------------------------
960                 // Transform positions
961                 //------------------------------------------------------------------
962 
963                 __m128 s0, s1, s2, d0, d1, d2;
964 
965                 // Load source positions
966                 s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
967                 s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
968                 s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3
969 
970                 // Rearrange to 3x4 component-major order for batch calculation
971                 __MM_TRANSPOSE4x3_PS(s0, s1, s2);
972 
973                 // Transform by collapsed matrix
974 
975                 // Shuffle row 0 of the four collapsed matrices to calculate the X components
976                 __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
977 
978                 // Transform X components
979                 d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3
980 
981                 // Shuffle row 1 of the four collapsed matrices to calculate the Y components
982                 __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
983 
984                 // Transform Y components
985                 d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3
986 
987                 // Shuffle row 2 of the four collapsed matrices to calculate the Z components
988                 __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
989 
990                 // Transform Z components
991                 d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3
992 
993                 // Rearrange back to 4x3 interleaved format to store the results
994                 __MM_TRANSPOSE3x4_PS(d0, d1, d2);
995 
996                 // Store blended positions
997                 DestPosAccessor::store(pDestPos + 0, d0);
998                 DestPosAccessor::store(pDestPos + 4, d1);
999                 DestPosAccessor::store(pDestPos + 8, d2);
1000 
1001                 // Advance 4 vertices
1002                 pSrcPos += 4 * 3;
1003                 pDestPos += 4 * 3;
1004             }
1005         }
1006     };
1007     static FORCEINLINE void softwareVertexSkinning_SSE_PosOnly_Packed(
1008         const float* pSrcPos, float* pDestPos,
1009         const float* pBlendWeight, const unsigned char* pBlendIndex,
1010         const Matrix4* const* blendMatrices,
1011         size_t blendWeightStride, size_t blendIndexStride,
1012         size_t numWeightsPerVertex,
1013         size_t numIterations)
1014     {
1015         assert(_isAlignedForSSE(pSrcPos));
1016 
1017         // Only two versions are instantiated, since other alignment combinations are not that important.
1018         if (_isAlignedForSSE(pDestPos))
1019         {
1020             SoftwareVertexSkinning_SSE_PosOnly_Packed<true, true>::apply(
1021                 pSrcPos, pDestPos,
1022                 pBlendWeight, pBlendIndex,
1023                 blendMatrices,
1024                 blendWeightStride, blendIndexStride,
1025                 numWeightsPerVertex,
1026                 numIterations);
1027         }
1028         else
1029         {
1030             SoftwareVertexSkinning_SSE_PosOnly_Packed<true, false>::apply(
1031                 pSrcPos, pDestPos,
1032                 pBlendWeight, pBlendIndex,
1033                 blendMatrices,
1034                 blendWeightStride, blendIndexStride,
1035                 numWeightsPerVertex,
1036                 numIterations);
1037         }
1038     }
1039     //---------------------------------------------------------------------
1040     //---------------------------------------------------------------------
1041     //---------------------------------------------------------------------
1042     OptimisedUtilSSE::OptimisedUtilSSE(void)
1043         : mPreferGeneralVersionForSharedBuffers(false)
1044     {
1045         // For the AMD Athlon XP (but not the Athlon 64), it's preferable to never use the
1046         // unrolled version for shared buffers at all, presumably because that version
1047         // runs out of usable CPU registers, or hits an L1/L2 cache related problem, causing
1048         // a slight performance loss compared to the general version.
1049         //
1050 
1051 		if (PlatformInformation::getCpuIdentifier().find("AuthenticAMD") != String::npos)
1052         {
1053             // How can we tell an Athlon XP from an Athlon 64?
1054             // Just test whether the CPU supports SSE2/SSE3 or not; if not,
1055             // assume the general version is faster than the unrolled version :)
1056             //
1057             if (!(PlatformInformation::getCpuFeatures() &
1058                 (PlatformInformation::CPU_FEATURE_SSE2 | PlatformInformation::CPU_FEATURE_SSE3)))
1059             {
1060                 mPreferGeneralVersionForSharedBuffers = true;
1061             }
1062         }
1063     }
1064     //---------------------------------------------------------------------
1065     void OptimisedUtilSSE::softwareVertexSkinning(
1066         const float *pSrcPos, float *pDestPos,
1067         const float *pSrcNorm, float *pDestNorm,
1068         const float *pBlendWeight, const unsigned char* pBlendIndex,
1069         const Matrix4* const* blendMatrices,
1070         size_t srcPosStride, size_t destPosStride,
1071         size_t srcNormStride, size_t destNormStride,
1072         size_t blendWeightStride, size_t blendIndexStride,
1073         size_t numWeightsPerVertex,
1074         size_t numVertices)
1075     {
1076         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1077 
1078         // All position/normal pointers should be perfectly aligned, but still check here
1079         // to guard against hardware buffers allocated by a potentially buggy driver that
1080         // doesn't honour the alignment properly.
1081         // Because we use a meta-function technique here, the code is easy to maintain
1082         // and still provides all possible alignment combinations.
1083         //
1084 
1085         // Use unrolled routines only if there are a lot of vertices
1086         if (numVertices > OGRE_SSE_SKINNING_UNROLL_VERTICES)
1087         {
1088             if (pSrcNorm)
1089             {
1090                 // Blend position and normal
1091 
1092                 if (!mPreferGeneralVersionForSharedBuffers &&
1093                     srcPosStride == sizeof(float) * (3 + 3) && destPosStride == sizeof(float) * (3 + 3) &&
1094                     pSrcNorm == pSrcPos + 3 && pDestNorm == pDestPos + 3)
1095                 {
1096                     // Position and normal share a packed (interleaved) buffer
1097 
1098                     size_t srcPosAlign = (size_t)pSrcPos & 15;
1099                     assert((srcPosAlign & 3) == 0);
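                         // With interleaved position + normal data the vertex stride is 6 floats
                         // (24 bytes), so a vertex can only begin at offset 0 or 8 within a
                         // 16-byte block. If it begins at offset 8, blending exactly one vertex
                         // with the general routine restores 16-byte alignment
                         // (8 + 24 = 32, a multiple of 16).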
1100 
1101                     // Blend unaligned vertices with general SIMD routine
1102                     if (srcPosAlign == 8)   // Because the alignment shifts by 8 bytes per vertex
1103                     {
1104                         size_t count = srcPosAlign / 8;
1105                         numVertices -= count;
1106                         softwareVertexSkinning_SSE_General(
1107                             pSrcPos, pDestPos,
1108                             pSrcNorm, pDestNorm,
1109                             pBlendWeight, pBlendIndex,
1110                             blendMatrices,
1111                             srcPosStride, destPosStride,
1112                             srcNormStride, destNormStride,
1113                             blendWeightStride, blendIndexStride,
1114                             numWeightsPerVertex,
1115                             count);
1116 
1117                         pSrcPos += count * (3 + 3);
1118                         pDestPos += count * (3 + 3);
1119                         pSrcNorm += count * (3 + 3);
1120                         pDestNorm += count * (3 + 3);
1121                         advanceRawPointer(pBlendWeight, count * blendWeightStride);
1122                         advanceRawPointer(pBlendIndex, count * blendIndexStride);
1123                     }
1124 
1125                     // Blend vertices, four vertices per-iteration
1126                     size_t numIterations = numVertices / 4;
1127                     softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
1128                         pSrcPos, pDestPos,
1129                         pBlendWeight, pBlendIndex,
1130                         blendMatrices,
1131                         blendWeightStride, blendIndexStride,
1132                         numWeightsPerVertex,
1133                         numIterations);
1134 
1135                     // Advance pointers for remaining vertices
1136                     numVertices &= 3;
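                         // 'numVertices &= 3' keeps just the remainder (numVertices % 4); the
                         // leftover vertices are blended by the general SIMD routine at the end
                         // of this function, so the pointers only need advancing when a
                         // remainder actually exists.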
1137                     if (numVertices)
1138                     {
1139                         pSrcPos += numIterations * 4 * (3 + 3);
1140                         pDestPos += numIterations * 4 * (3 + 3);
1141                         pSrcNorm += numIterations * 4 * (3 + 3);
1142                         pDestNorm += numIterations * 4 * (3 + 3);
1143                         advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1144                         advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1145                     }
1146                 }
1147                 else if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3 &&
1148                          srcNormStride == sizeof(float) * 3 && destNormStride == sizeof(float) * 3)
1149                 {
1150                     // Position and normal are separate buffers, and all of them are packed
1151 
1152                     size_t srcPosAlign = (size_t)pSrcPos & 15;
1153                     assert((srcPosAlign & 3) == 0);
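                         // With a packed 3-float (12-byte) stride each vertex shifts the offset
                         // within a 16-byte block back by 4 bytes, so blending srcPosAlign / 4
                         // vertices with the general routine restores 16-byte alignment
                         // (e.g. offset 8: 8 + 2 * 12 = 32, a multiple of 16).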
1154 
1155                     // Blend unaligned vertices with general SIMD routine
1156                     if (srcPosAlign)
1157                     {
1158                         size_t count = srcPosAlign / 4;
1159                         numVertices -= count;
1160                         softwareVertexSkinning_SSE_General(
1161                             pSrcPos, pDestPos,
1162                             pSrcNorm, pDestNorm,
1163                             pBlendWeight, pBlendIndex,
1164                             blendMatrices,
1165                             srcPosStride, destPosStride,
1166                             srcNormStride, destNormStride,
1167                             blendWeightStride, blendIndexStride,
1168                             numWeightsPerVertex,
1169                             count);
1170 
1171                         pSrcPos += count * 3;
1172                         pDestPos += count * 3;
1173                         pSrcNorm += count * 3;
1174                         pDestNorm += count * 3;
1175                         advanceRawPointer(pBlendWeight, count * blendWeightStride);
1176                         advanceRawPointer(pBlendIndex, count * blendIndexStride);
1177                     }
1178 
1179                     // Blend vertices, four vertices per-iteration
1180                     size_t numIterations = numVertices / 4;
1181                     softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
1182                         pSrcPos, pDestPos,
1183                         pSrcNorm, pDestNorm,
1184                         pBlendWeight, pBlendIndex,
1185                         blendMatrices,
1186                         blendWeightStride, blendIndexStride,
1187                         numWeightsPerVertex,
1188                         numIterations);
1189 
1190                     // Advance pointers for remaining vertices
1191                     numVertices &= 3;
1192                     if (numVertices)
1193                     {
1194                         pSrcPos += numIterations * 4 * 3;
1195                         pDestPos += numIterations * 4 * 3;
1196                         pSrcNorm += numIterations * 4 * 3;
1197                         pDestNorm += numIterations * 4 * 3;
1198                         advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1199                         advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1200                     }
1201                 }
1202                 else    // Not 'packed' form or wrong order between position and normal
1203                 {
1204                     // Should never occur, do nothing here just in case
1205                 }
1206             }
1207             else    // !pSrcNorm
1208             {
1209                 // Blend position only
1210 
1211                 if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3)
1212                 {
1213                     // All buffers are packed
1214 
1215                     size_t srcPosAlign = (size_t)pSrcPos & 15;
1216                     assert((srcPosAlign & 3) == 0);
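                         // Same re-alignment trick as above: with a 12-byte vertex stride,
                         // srcPosAlign / 4 vertices handled by the general routine restore
                         // 16-byte alignment for the unrolled position-only path.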
1217 
1218                     // Blend unaligned vertices with general SIMD routine
1219                     if (srcPosAlign)
1220                     {
1221                         size_t count = srcPosAlign / 4;
1222                         numVertices -= count;
1223                         softwareVertexSkinning_SSE_General(
1224                             pSrcPos, pDestPos,
1225                             pSrcNorm, pDestNorm,
1226                             pBlendWeight, pBlendIndex,
1227                             blendMatrices,
1228                             srcPosStride, destPosStride,
1229                             srcNormStride, destNormStride,
1230                             blendWeightStride, blendIndexStride,
1231                             numWeightsPerVertex,
1232                             count);
1233 
1234                         pSrcPos += count * 3;
1235                         pDestPos += count * 3;
1236                         advanceRawPointer(pBlendWeight, count * blendWeightStride);
1237                         advanceRawPointer(pBlendIndex, count * blendIndexStride);
1238                     }
1239 
1240                     // Blend vertices, four vertices per-iteration
1241                     size_t numIterations = numVertices / 4;
1242                     softwareVertexSkinning_SSE_PosOnly_Packed(
1243                         pSrcPos, pDestPos,
1244                         pBlendWeight, pBlendIndex,
1245                         blendMatrices,
1246                         blendWeightStride, blendIndexStride,
1247                         numWeightsPerVertex,
1248                         numIterations);
1249 
1250                     // Advance pointers for remaining vertices
1251                     numVertices &= 3;
1252                     if (numVertices)
1253                     {
1254                         pSrcPos += numIterations * 4 * 3;
1255                         pDestPos += numIterations * 4 * 3;
1256                         advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1257                         advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1258                     }
1259                 }
1260                 else    // Not 'packed' form
1261                 {
1262                     // Might occur only if the user forced software blending of positions only
1263                 }
1264             }
1265         }
1266 
1267         // Blend remaining vertices; this must also use the SIMD routine for identical results,
1268         // since mixing the general floating-point and SIMD algorithms would introduce
1269         // floating-point discrepancies.
1270         if (numVertices)
1271         {
1272             softwareVertexSkinning_SSE_General(
1273                 pSrcPos, pDestPos,
1274                 pSrcNorm, pDestNorm,
1275                 pBlendWeight, pBlendIndex,
1276                 blendMatrices,
1277                 srcPosStride, destPosStride,
1278                 srcNormStride, destNormStride,
1279                 blendWeightStride, blendIndexStride,
1280                 numWeightsPerVertex,
1281                 numVertices);
1282         }
1283     }
1284     //---------------------------------------------------------------------
1285     void OptimisedUtilSSE::softwareVertexMorph(
1286         Real t,
1287         const float *pSrc1, const float *pSrc2,
1288         float *pDst,
1289 		size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
1290         size_t numVertices,
1291 		bool morphNormals)
1292     {
1293         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1294 
1295         __m128 src01, src02, src11, src12, src21, src22;
1296         __m128 dst0, dst1, dst2;
1297 
1298         __m128 t4 = _mm_load_ps1(&t);
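             // t4 broadcasts the morph weight to all four lanes. Assuming __MM_LERP_PS(t, a, b)
             // expands to a + t * (b - a), per its name, each unrolled iteration below is the
             // scalar equivalent of pDst[i] = pSrc1[i] + t * (pSrc2[i] - pSrc1[i]) for 12
             // consecutive floats.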
1299 
1300 
1301 		// If we're morphing normals, we have twice the number of floats to process
1302 		// Positions are interleaved with normals, so we'll have to separately
1303 		// normalise just the normals later; we'll just lerp in the first pass
1304 		// We can't normalise as we go because normals & positions are only 3 floats
1305 		// each so are not aligned for SSE, we'd mix the data up
1306 		size_t normalsMultiplier = morphNormals ? 2 : 1;
1307         size_t numIterations = (numVertices*normalsMultiplier) / 4;
1308 		size_t numVerticesRemainder = (numVertices*normalsMultiplier) & 3;
1309 
1310 		// Save for later
1311 		float *pStartDst = pDst;
1312 
1313         // Never use the meta-function technique to access memory here, because it looks like
1314         // VC7.1 generates somewhat inefficient binary code when the following code is put into
1315         // an inline function.
1316 
1317         if (_isAlignedForSSE(pSrc1) && _isAlignedForSSE(pSrc2) && _isAlignedForSSE(pDst))
1318         {
1319             // All data aligned
1320 
1321             // Morph 4 vertices per iteration. Specially designed to use as many
1322             // available CPU registers as possible (7 registers used here),
1323             // and to avoid temporary values allocated on the stack, suppressing
1324             // extra memory accesses.
1325             for (size_t i = 0; i < numIterations; ++i)
1326             {
1327                 // 12 floating-point values
1328                 src01 = __MM_LOAD_PS(pSrc1 + 0);
1329                 src02 = __MM_LOAD_PS(pSrc2 + 0);
1330                 src11 = __MM_LOAD_PS(pSrc1 + 4);
1331                 src12 = __MM_LOAD_PS(pSrc2 + 4);
1332                 src21 = __MM_LOAD_PS(pSrc1 + 8);
1333                 src22 = __MM_LOAD_PS(pSrc2 + 8);
1334                 pSrc1 += 12; pSrc2 += 12;
1335 
1336                 dst0 = __MM_LERP_PS(t4, src01, src02);
1337                 dst1 = __MM_LERP_PS(t4, src11, src12);
1338                 dst2 = __MM_LERP_PS(t4, src21, src22);
1339 
1340                 __MM_STORE_PS(pDst + 0, dst0);
1341                 __MM_STORE_PS(pDst + 4, dst1);
1342                 __MM_STORE_PS(pDst + 8, dst2);
1343                 pDst += 12;
1344             }
1345 
1346             // Morph remaining vertices
1347             switch (numVerticesRemainder)
1348             {
1349             case 3:
1350                 // 9 floating-point values
1351                 src01 = __MM_LOAD_PS(pSrc1 + 0);
1352                 src02 = __MM_LOAD_PS(pSrc2 + 0);
1353                 src11 = __MM_LOAD_PS(pSrc1 + 4);
1354                 src12 = __MM_LOAD_PS(pSrc2 + 4);
1355                 src21 = _mm_load_ss(pSrc1 + 8);
1356                 src22 = _mm_load_ss(pSrc2 + 8);
1357 
1358                 dst0 = __MM_LERP_PS(t4, src01, src02);
1359                 dst1 = __MM_LERP_PS(t4, src11, src12);
1360                 dst2 = __MM_LERP_SS(t4, src21, src22);
1361 
1362                 __MM_STORE_PS(pDst + 0, dst0);
1363                 __MM_STORE_PS(pDst + 4, dst1);
1364                 _mm_store_ss(pDst + 8, dst2);
1365                 break;
1366 
1367             case 2:
1368                 // 6 floating-point values
1369                 src01 = __MM_LOAD_PS(pSrc1 + 0);
1370                 src02 = __MM_LOAD_PS(pSrc2 + 0);
1371                 src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4));  // t4 is meaningless here
1372                 src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4));  // t4 is meaningless here
1373 
1374                 dst0 = __MM_LERP_PS(t4, src01, src02);
1375                 dst1 = __MM_LERP_PS(t4, src11, src12);
1376 
1377                 __MM_STORE_PS(pDst + 0, dst0);
1378                 _mm_storel_pi((__m64*)(pDst + 4), dst1);
1379                 break;
1380 
1381             case 1:
1382                 // 3 floating-point values
1383                 src01 = _mm_load_ss(pSrc1 + 2);
1384                 src02 = _mm_load_ss(pSrc2 + 2);
1385                 src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0));
1386                 src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0));
1387 
1388                 dst0 = __MM_LERP_PS(t4, src01, src02);
1389 
1390                 _mm_storeh_pi((__m64*)(pDst + 0), dst0);
1391                 _mm_store_ss(pDst + 2, dst0);
1392                 break;
1393             }
1394         }
1395         else    // Should never occur, just in case of buggy drivers
1396         {
1397             // Assume all data unaligned
1398 
1399             // Morph 4 vertices per iteration. Specially designed to use as many
1400             // available CPU registers as possible (7 registers used here),
1401             // and to avoid temporary values allocated on the stack, suppressing
1402             // extra memory accesses.
1403             for (size_t i = 0; i < numIterations; ++i)
1404             {
1405                 // 12 floating-point values
1406                 src01 = _mm_loadu_ps(pSrc1 + 0);
1407                 src02 = _mm_loadu_ps(pSrc2 + 0);
1408                 src11 = _mm_loadu_ps(pSrc1 + 4);
1409                 src12 = _mm_loadu_ps(pSrc2 + 4);
1410                 src21 = _mm_loadu_ps(pSrc1 + 8);
1411                 src22 = _mm_loadu_ps(pSrc2 + 8);
1412                 pSrc1 += 12; pSrc2 += 12;
1413 
1414                 dst0 = __MM_LERP_PS(t4, src01, src02);
1415                 dst1 = __MM_LERP_PS(t4, src11, src12);
1416                 dst2 = __MM_LERP_PS(t4, src21, src22);
1417 
1418                 _mm_storeu_ps(pDst + 0, dst0);
1419                 _mm_storeu_ps(pDst + 4, dst1);
1420                 _mm_storeu_ps(pDst + 8, dst2);
1421                 pDst += 12;
1422 
1423             }
1424 
1425             // Morph remaining vertices
1426             switch (numVerticesRemainder)
1427             {
1428             case 3:
1429                 // 9 floating-point values
1430                 src01 = _mm_loadu_ps(pSrc1 + 0);
1431                 src02 = _mm_loadu_ps(pSrc2 + 0);
1432                 src11 = _mm_loadu_ps(pSrc1 + 4);
1433                 src12 = _mm_loadu_ps(pSrc2 + 4);
1434                 src21 = _mm_load_ss(pSrc1 + 8);
1435                 src22 = _mm_load_ss(pSrc2 + 8);
1436 
1437                 dst0 = __MM_LERP_PS(t4, src01, src02);
1438                 dst1 = __MM_LERP_PS(t4, src11, src12);
1439                 dst2 = __MM_LERP_SS(t4, src21, src22);
1440 
1441                 _mm_storeu_ps(pDst + 0, dst0);
1442                 _mm_storeu_ps(pDst + 4, dst1);
1443                 _mm_store_ss(pDst + 8, dst2);
1444                 break;
1445 
1446             case 2:
1447                 // 6 floating-point values
1448                 src01 = _mm_loadu_ps(pSrc1 + 0);
1449                 src02 = _mm_loadu_ps(pSrc2 + 0);
1450                 src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4));  // t4 is meaningless here
1451                 src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4));  // t4 is meaningless here
1452 
1453                 dst0 = __MM_LERP_PS(t4, src01, src02);
1454                 dst1 = __MM_LERP_PS(t4, src11, src12);
1455 
1456                 _mm_storeu_ps(pDst + 0, dst0);
1457                 _mm_storel_pi((__m64*)(pDst + 4), dst1);
1458                 break;
1459 
1460             case 1:
1461                 // 3 floating-point values
1462                 src01 = _mm_load_ss(pSrc1 + 2);
1463                 src02 = _mm_load_ss(pSrc2 + 2);
1464                 src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0));
1465                 src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0));
1466 
1467                 dst0 = __MM_LERP_PS(t4, src01, src02);
1468 
1469                 _mm_storeh_pi((__m64*)(pDst + 0), dst0);
1470                 _mm_store_ss(pDst + 2, dst0);
1471                 break;
1472             }
1473 
1474         }
1475 
1476 		if (morphNormals)
1477 		{
1478 
1479 			// Now we need to do an unaligned normalise on the normals data we just
1480 			// lerped; because normals are 3 elements each, they're always unaligned
1481 			float *pNorm = pStartDst;
1482 
1483 			// Offset past first position
1484 			pNorm += 3;
1485 
1486 			// We'll do one normal each iteration, but still use SSE
1487 			for (size_t n = 0; n < numVertices; ++n)
1488 			{
1489 				// normalise function
1490 				__m128 norm;
1491 
1492 				// load 3 floating-point normal values
1493 				// This loads into [0] and clears the rest
1494                 norm = _mm_load_ss(pNorm + 2);
1495 				// This loads into [2,3]. [1] is unused
1496                 norm = _mm_loadh_pi(norm, (__m64*)(pNorm + 0));
1497 
1498 				// Fill a 4-vec with the squared vector length
1499 				// Square each component
1500 				__m128 sq = _mm_mul_ps(norm, norm);
1501 				// Horizontal add - for this we want this effect (indices refer to sq):
1502 				// orig   3 | 2 | 1 | 0
1503 				// add1   0 | 0 | 0 | 2
1504 				// add2   2 | 3 | 0 | 3
1505 				// This way elements 0, 2 and 3 have the sum of all entries (except 1 which is unused)
1506 
1507 				__m128 tmp = _mm_add_ps(sq, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(0,0,0,2)));
1508 				// Add the final combination; note both shuffles must source the original squares,
1509 				// otherwise the component added in the first step would be counted twice.
1510 				tmp = _mm_add_ps(tmp, _mm_shuffle_ps(sq, sq, _MM_SHUFFLE(2,3,0,3)));
1511 				// Then divide by the length (sqrt) to normalise; elements 0, 2 and 3 of tmp hold the squared length
1512 				norm = _mm_div_ps(norm, _mm_sqrt_ps(tmp));
1513 
1514 				// Store back in the same place
1515 				_mm_storeh_pi((__m64*)(pNorm + 0), norm);
1516                 _mm_store_ss(pNorm + 2, norm);
1517 
1518 				// Skip to next vertex (3x normal components, 3x position components)
1519 				pNorm += 6;
1520 
1521 
1522 			}
1523 
1524 
1525 		}
1526     }
1527     //---------------------------------------------------------------------
1528     void OptimisedUtilSSE::concatenateAffineMatrices(
1529         const Matrix4& baseMatrix,
1530         const Matrix4* pSrcMat,
1531         Matrix4* pDstMat,
1532         size_t numMatrices)
1533     {
1534         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1535 
1536         assert(_isAlignedForSSE(pSrcMat));
1537         assert(_isAlignedForSSE(pDstMat));
1538 
1539         // Load base matrix, unaligned
1540         __m128 m0 = _mm_loadu_ps(baseMatrix[0]);
1541         __m128 m1 = _mm_loadu_ps(baseMatrix[1]);
1542         __m128 m2 = _mm_loadu_ps(baseMatrix[2]);
1543         __m128 m3 = _mm_loadu_ps(baseMatrix[3]);        // m3 should be equal to (0, 0, 0, 1)
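             // Both matrices are affine (bottom row (0, 0, 0, 1)), so each destination row is
             // dst[i] = base[i].x * src[0] + base[i].y * src[1] + base[i].z * src[2] + (0, 0, 0, base[i].w).
             // The last term is obtained below as _mm_mul_ps(m0/m1/m2, m3): multiplying by
             // (0, 0, 0, 1) zeroes everything except the translation element, with no shuffle
             // needed (assuming __MM_ACCUM4_PS simply sums its four arguments).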
1544 
1545         for (size_t i = 0; i < numMatrices; ++i)
1546         {
1547             // Load source matrix, aligned
1548             __m128 s0 = __MM_LOAD_PS((*pSrcMat)[0]);
1549             __m128 s1 = __MM_LOAD_PS((*pSrcMat)[1]);
1550             __m128 s2 = __MM_LOAD_PS((*pSrcMat)[2]);
1551 
1552             ++pSrcMat;
1553 
1554             __m128 t0, t1, t2, t3;
1555 
1556             // Concatenate matrix, and store results
1557 
1558             // Row 0
1559             t0 = _mm_mul_ps(__MM_SELECT(m0, 0), s0);
1560             t1 = _mm_mul_ps(__MM_SELECT(m0, 1), s1);
1561             t2 = _mm_mul_ps(__MM_SELECT(m0, 2), s2);
1562             t3 = _mm_mul_ps(m0, m3);    // Compiler should optimise this out of the loop
1563             __MM_STORE_PS((*pDstMat)[0], __MM_ACCUM4_PS(t0,t1,t2,t3));
1564 
1565             // Row 1
1566             t0 = _mm_mul_ps(__MM_SELECT(m1, 0), s0);
1567             t1 = _mm_mul_ps(__MM_SELECT(m1, 1), s1);
1568             t2 = _mm_mul_ps(__MM_SELECT(m1, 2), s2);
1569             t3 = _mm_mul_ps(m1, m3);    // Compiler should optimise this out of the loop
1570             __MM_STORE_PS((*pDstMat)[1], __MM_ACCUM4_PS(t0,t1,t2,t3));
1571 
1572             // Row 2
1573             t0 = _mm_mul_ps(__MM_SELECT(m2, 0), s0);
1574             t1 = _mm_mul_ps(__MM_SELECT(m2, 1), s1);
1575             t2 = _mm_mul_ps(__MM_SELECT(m2, 2), s2);
1576             t3 = _mm_mul_ps(m2, m3);    // Compiler should optimise this out of the loop
1577             __MM_STORE_PS((*pDstMat)[2], __MM_ACCUM4_PS(t0,t1,t2,t3));
1578 
1579             // Row 3
1580             __MM_STORE_PS((*pDstMat)[3], m3);
1581 
1582             ++pDstMat;
1583         }
1584     }
1585     //---------------------------------------------------------------------
1586     void OptimisedUtilSSE::calculateFaceNormals(
1587         const float *positions,
1588         const EdgeData::Triangle *triangles,
1589         Vector4 *faceNormals,
1590         size_t numTriangles)
1591     {
1592         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1593 
1594         assert(_isAlignedForSSE(faceNormals));
1595 
1596 // Load Vector3 as: (x, 0, y, z)
1597 #define __LOAD_VECTOR3(p)   _mm_loadh_pi(_mm_load_ss(p), (const __m64*)((p)+1))
1598 
1599         // Mask used to change the sign of single precision floating point values.
1600         OGRE_SIMD_ALIGNED_DECL(static const uint32, msSignMask[4]) =
1601         {
1602             0x80000000, 0x80000000, 0x80000000, 0x80000000,
1603         };
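             // XOR-ing a float with 0x80000000 flips only the IEEE-754 sign bit, so the mask
             // above provides a cheap four-wide negation; it is used below to compute
             // w = -(n . v0), the distance term of each face's plane.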
1604 
1605         size_t numIterations = numTriangles / 4;
1606         numTriangles &= 3;
1607 
1608         // Four triangles per-iteration
1609         for (size_t i = 0; i < numIterations; ++i)
1610         {
1611 
1612 // Load four Vector3 as: (x0, x1, x2, x3), (y0, y1, y2, y3), (z0, z1, z2, z3)
1613 #define __LOAD_FOUR_VECTOR3(x, y, z, p0, p1, p2, p3)                    \
1614             {                                                           \
1615                 __m128 v0 = __LOAD_VECTOR3(p0);     /* x0 -- y0 z0 */   \
1616                 __m128 v1 = __LOAD_VECTOR3(p1);     /* x1 -- y1 z1 */   \
1617                 __m128 v2 = __LOAD_VECTOR3(p2);     /* x2 -- y2 z2 */   \
1618                 __m128 v3 = __LOAD_VECTOR3(p3);     /* x3 -- y3 z3 */   \
1619                 __m128 t0, t1;                                          \
1620                                                                         \
1621                 t0 = _mm_unpacklo_ps(v0, v2);       /* x0 x2 -- -- */   \
1622                 t1 = _mm_unpacklo_ps(v1, v3);       /* x1 x3 -- -- */   \
1623                 x  = _mm_unpacklo_ps(t0, t1);       /* x0 x1 x2 x3 */   \
1624                                                                         \
1625                 t0 = _mm_unpackhi_ps(v0, v2);       /* y0 y2 z0 z2 */   \
1626                 t1 = _mm_unpackhi_ps(v1, v3);       /* y1 y3 z1 z3 */   \
1627                 y  = _mm_unpacklo_ps(t0, t1);       /* y0 y1 y2 y3 */   \
1628                 z  = _mm_unpackhi_ps(t0, t1);       /* z0 z1 z2 z3 */   \
1629             }
1630 
1631             __m128 x0, x1, x2, y0, y1, y2, z0, z1, z2;
1632 
1633             // Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz
1634             __LOAD_FOUR_VECTOR3(x0, y0, z0,
1635                 positions + triangles[0].vertIndex[0] * 3,
1636                 positions + triangles[1].vertIndex[0] * 3,
1637                 positions + triangles[2].vertIndex[0] * 3,
1638                 positions + triangles[3].vertIndex[0] * 3);
1639 
1640             // Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz
1641             __LOAD_FOUR_VECTOR3(x1, y1, z1,
1642                 positions + triangles[0].vertIndex[1] * 3,
1643                 positions + triangles[1].vertIndex[1] * 3,
1644                 positions + triangles[2].vertIndex[1] * 3,
1645                 positions + triangles[3].vertIndex[1] * 3);
1646 
1647             // Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz
1648             __LOAD_FOUR_VECTOR3(x2, y2, z2,
1649                 positions + triangles[0].vertIndex[2] * 3,
1650                 positions + triangles[1].vertIndex[2] * 3,
1651                 positions + triangles[2].vertIndex[2] * 3,
1652                 positions + triangles[3].vertIndex[2] * 3);
1653 
1654             triangles += 4;
1655 
1656             // Calculate triangle face normals
1657 
1658             // a = v1 - v0
1659             __m128 ax = _mm_sub_ps(x1, x0);
1660             __m128 ay = _mm_sub_ps(y1, y0);
1661             __m128 az = _mm_sub_ps(z1, z0);
1662 
1663             // b = v2 - v0
1664             __m128 bx = _mm_sub_ps(x2, x0);
1665             __m128 by = _mm_sub_ps(y2, y0);
1666             __m128 bz = _mm_sub_ps(z2, z0);
1667 
1668             // n = a cross b
1669             __m128 nx = _mm_sub_ps(_mm_mul_ps(ay, bz), _mm_mul_ps(az, by));
1670             __m128 ny = _mm_sub_ps(_mm_mul_ps(az, bx), _mm_mul_ps(ax, bz));
1671             __m128 nz = _mm_sub_ps(_mm_mul_ps(ax, by), _mm_mul_ps(ay, bx));
1672 
1673             // w = - (n dot v0)
1674             __m128 nw = _mm_xor_ps(
1675                 __MM_DOT3x3_PS(nx, ny, nz, x0, y0, z0),
1676                 *(const __m128 *)&msSignMask);
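                 // Each resulting (nx, ny, nz, nw) encodes the triangle's (unnormalised) plane
                 // equation n . p + w = 0, which is the form stored in the Vector4 face normals.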
1677 
1678             // Arrange to per-triangle face normal major format
1679             __MM_TRANSPOSE4x4_PS(nx, ny, nz, nw);
1680 
1681             // Store results
1682             __MM_STORE_PS(&faceNormals[0].x, nx);
1683             __MM_STORE_PS(&faceNormals[1].x, ny);
1684             __MM_STORE_PS(&faceNormals[2].x, nz);
1685             __MM_STORE_PS(&faceNormals[3].x, nw);
1686             faceNormals += 4;
1687 
1688 #undef __LOAD_FOUR_VECTOR3
1689         }
1690 
1691         // Dealing with remaining triangles
1692         for (size_t j = 0; j < numTriangles; ++j)
1693         {
1694             // Load vertices of the triangle
1695             __m128 v0 = __LOAD_VECTOR3(positions + triangles->vertIndex[0] * 3);
1696             __m128 v1 = __LOAD_VECTOR3(positions + triangles->vertIndex[1] * 3);
1697             __m128 v2 = __LOAD_VECTOR3(positions + triangles->vertIndex[2] * 3);
1698             ++triangles;
1699 
1700             // Calculate face normal
1701 
1702             __m128 t0, t1;
1703 
1704             __m128 a = _mm_sub_ps(v1, v0);                      // ax 0 ay az
1705             __m128 b = _mm_sub_ps(v2, v0);                      // bx 0 by bz
1706             t0 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,0,1,3));    // az 0 ax ay
1707             t1 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2,0,1,3));    // bz 0 bx by
1708             t0 = _mm_mul_ps(t0, b);                             // az*bx 0 ax*by ay*bz
1709             t1 = _mm_mul_ps(t1, a);                             // ax*bz 0 ay*bx az*by
1710 
1711             __m128 n = _mm_sub_ps(t0, t1);                      // ny 0  nz nx
1712 
1713             __m128 d = _mm_mul_ps(                              // dy 0  dz dx
1714                 _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,3,1,2)), n);
1715 
1716             n = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(               // nx ny nz -(dx+dy+dz)
1717                 _mm_shuffle_ps(n, n, _MM_SHUFFLE(1,2,0,3)),     // nx ny nz 0
1718                 _mm_shuffle_ps(d, d, _MM_SHUFFLE(3,1,1,1))),    // 0  0  0  dx
1719                 _mm_shuffle_ps(d, d, _MM_SHUFFLE(0,1,1,1))),    // 0  0  0  dy
1720                 _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,1,1,1)));    // 0  0  0  dz
1721 
1722             // Store result
1723             __MM_STORE_PS(&faceNormals->x, n);
1724             ++faceNormals;
1725         }
1726 
1727 #undef __LOAD_VECTOR3
1728     }
1729     //---------------------------------------------------------------------
1730     void OptimisedUtilSSE::calculateLightFacing(
1731         const Vector4& lightPos,
1732         const Vector4* faceNormals,
1733         char* lightFacings,
1734         size_t numFaces)
1735     {
1736         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1737 
1738         assert(_isAlignedForSSE(faceNormals));
1739 
1740         // Map to convert a 4-bit mask to 4 byte values
1741         static const char msMaskMapping[16][4] =
1742         {
1743             {0, 0, 0, 0},   {1, 0, 0, 0},   {0, 1, 0, 0},   {1, 1, 0, 0},
1744             {0, 0, 1, 0},   {1, 0, 1, 0},   {0, 1, 1, 0},   {1, 1, 1, 0},
1745             {0, 0, 0, 1},   {1, 0, 0, 1},   {0, 1, 0, 1},   {1, 1, 0, 1},
1746             {0, 0, 1, 1},   {1, 0, 1, 1},   {0, 1, 1, 1},   {1, 1, 1, 1},
1747         };
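             // _mm_movemask_ps (used below) packs the sign bits of the four comparison results
             // into a 4-bit integer; indexing this table with that value expands each bit into
             // one byte per face, e.g. a bitmask of 0x5 (faces 0 and 2 facing the light)
             // maps to {1, 0, 1, 0}.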
1748 
1749         __m128 n0, n1, n2, n3;
1750         __m128 t0, t1;
1751         __m128 dp;
1752         int bitmask;
1753 
1754         // Load light vector, unaligned
1755         __m128 lp = _mm_loadu_ps(&lightPos.x);
1756 
1757         // Preload zero into a register for comparing dot product values
1758         __m128 zero = _mm_setzero_ps();
1759 
1760         size_t numIterations = numFaces / 4;
1761         numFaces &= 3;
1762 
1763         // Four faces per-iteration
1764         for (size_t i = 0; i < numIterations; ++i)
1765         {
1766             // Load face normals, aligned
1767             n0 = __MM_LOAD_PS(&faceNormals[0].x);
1768             n1 = __MM_LOAD_PS(&faceNormals[1].x);
1769             n2 = __MM_LOAD_PS(&faceNormals[2].x);
1770             n3 = __MM_LOAD_PS(&faceNormals[3].x);
1771             faceNormals += 4;
1772 
1773             // Multiply by light vector
1774             n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1775             n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
1776             n2 = _mm_mul_ps(n2, lp);        // x2 y2 z2 w2
1777             n3 = _mm_mul_ps(n3, lp);        // x3 y3 z3 w3
1778 
1779             // Horizontal add four vector values.
1780             t0 = _mm_add_ps(                                            // x0+z0 x1+z1 y0+w0 y1+w1
1781                 _mm_unpacklo_ps(n0, n1),    // x0 x1 y0 y1
1782                 _mm_unpackhi_ps(n0, n1));   // z0 z1 w0 w1
1783             t1 = _mm_add_ps(                                            // x2+z2 x3+z3 y2+w2 y3+w3
1784                 _mm_unpacklo_ps(n2, n3),    // x2 x3 y2 y3
1785                 _mm_unpackhi_ps(n2, n3));   // z2 z3 w2 w3
1786             dp = _mm_add_ps(                                            // dp0 dp1 dp2 dp3
1787                 _mm_movelh_ps(t0, t1),      // x0+z0 x1+z1 x2+z2 x3+z3
1788                 _mm_movehl_ps(t1, t0));     // y0+w0 y1+w1 y2+w2 y3+w3
1789 
1790             // Compare greater than zero and set up a 4-bit mask. Use '_mm_cmpnle_ps'
1791             // instead of '_mm_cmpgt_ps' here because we want to keep 'zero' untouched,
1792             // i.e. it stays the 2nd operand of the assembly instruction. In fact,
1793             // '_mm_cmpgt_ps' is implemented as 'CMPLTPS' with the operands swapped
1794             // in VC7.1.
1795             bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1796 
1797             // Convert the 4-bit mask to 4 bytes, and store the results.
1798             /*
1799             *reinterpret_cast<uint32*>(lightFacings) =
1800                 *reinterpret_cast<const uint32*>(msMaskMapping[bitmask]);
1801                 */
1802             memcpy(lightFacings, msMaskMapping[bitmask], sizeof(uint32));
1803 
1804 
1805             lightFacings += 4;
1806         }
1807 
1808         // Dealing with remaining faces
1809         switch (numFaces)
1810         {
1811         case 3:
1812             n0 = __MM_LOAD_PS(&faceNormals[0].x);
1813             n1 = __MM_LOAD_PS(&faceNormals[1].x);
1814             n2 = __MM_LOAD_PS(&faceNormals[2].x);
1815 
1816             n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1817             n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
1818             n2 = _mm_mul_ps(n2, lp);        // x2 y2 z2 w2
1819 
1820             t0 = _mm_add_ps(                                            // x0+z0 x1+z1 y0+w0 y1+w1
1821                 _mm_unpacklo_ps(n0, n1),    // x0 x1 y0 y1
1822                 _mm_unpackhi_ps(n0, n1));   // z0 z1 w0 w1
1823             t1 = _mm_add_ps(                                            // x2+z2 x2+z2 y2+w2 y2+w2
1824                 _mm_unpacklo_ps(n2, n2),    // x2 x2 y2 y2
1825                 _mm_unpackhi_ps(n2, n2));   // z2 z2 w2 w2
1826             dp = _mm_add_ps(                                            // dp0 dp1 dp2 dp2
1827                 _mm_movelh_ps(t0, t1),      // x0+z0 x1+z1 x2+z2 x2+z2
1828                 _mm_movehl_ps(t1, t0));     // y0+w0 y1+w1 y2+w2 y2+w2
1829 
1830             bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1831 
1832             lightFacings[0] = msMaskMapping[bitmask][0];
1833             lightFacings[1] = msMaskMapping[bitmask][1];
1834             lightFacings[2] = msMaskMapping[bitmask][2];
1835             break;
1836 
1837         case 2:
1838             n0 = __MM_LOAD_PS(&faceNormals[0].x);
1839             n1 = __MM_LOAD_PS(&faceNormals[1].x);
1840 
1841             n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1842             n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
1843 
1844             t0 = _mm_add_ps(                                            // x0+z0 x1+z1 y0+w0 y1+w1
1845                 _mm_unpacklo_ps(n0, n1),    // x0 x1 y0 y1
1846                 _mm_unpackhi_ps(n0, n1));   // z0 z1 w0 w1
1847             dp = _mm_add_ps(                                            // dp0 dp1 dp0 dp1
1848                 _mm_movelh_ps(t0, t0),      // x0+z0 x1+z1 x0+z0 x1+z1
1849                 _mm_movehl_ps(t0, t0));     // y0+w0 y1+w1 y0+w0 y1+w1
1850 
1851             bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1852 
1853             lightFacings[0] = msMaskMapping[bitmask][0];
1854             lightFacings[1] = msMaskMapping[bitmask][1];
1855             break;
1856 
1857         case 1:
1858             n0 = __MM_LOAD_PS(&faceNormals[0].x);
1859 
1860             n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1861 
1862             t0 = _mm_add_ps(                                            // x0+z0 x0+z0 y0+w0 y0+w0
1863                 _mm_unpacklo_ps(n0, n0),    // x0 x0 y0 y0
1864                 _mm_unpackhi_ps(n0, n0));   // z0 z0 w0 w0
1865             dp = _mm_add_ps(                                            // dp0 dp0 dp0 dp0
1866                 _mm_movelh_ps(t0, t0),      // x0+z0 x0+z0 x0+z0 x0+z0
1867                 _mm_movehl_ps(t0, t0));     // y0+w0 y0+w0 y0+w0 y0+w0
1868 
1869             bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1870 
1871             lightFacings[0] = msMaskMapping[bitmask][0];
1872             break;
1873         }
1874     }
1875     //---------------------------------------------------------------------
1876     // Template to extrude vertices for directional light.
1877     template <bool srcAligned, bool destAligned>
1878     struct ExtrudeVertices_SSE_DirectionalLight
1879     {
1880         static void apply(
1881             const Vector4& lightPos,
1882             Real extrudeDist,
1883             const float* pSrcPos,
1884             float* pDestPos,
1885             size_t numVertices)
1886         {
1887             typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
1888             typedef SSEMemoryAccessor<destAligned> DestAccessor;
1889 
1890             // Directional light, extrusion is along light direction
1891 
1892             // Load light vector, unaligned
1893             __m128 lp = _mm_loadu_ps(&lightPos.x);
1894 
1895             // Calculate the extrusion direction. Note that we use the inverted direction here
1896             // to eliminate an extra negation instruction; we'll compensate for that
1897             // by using a subtract instruction instead later.
1898             __m128 tmp = _mm_mul_ps(lp, lp);
1899             tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)), _mm_movehl_ps(tmp, tmp));
1900             // Looks like VC7.1 generates somewhat inefficient code for 'rsqrtss', so use 'rsqrtps' instead
1901             tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), _mm_load_ss(&extrudeDist));
1902             __m128 dir = _mm_mul_ps(lp, __MM_SELECT(tmp, 0));               // X Y Z -
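                 // tmp lane 0 now holds the squared length of lightPos.xyz; rsqrt approximates
                 // its reciprocal square root, so dir = lightPos.xyz * (extrudeDist / |lightPos.xyz|):
                 // an offset of length extrudeDist along the (inverted, as noted above) extrusion
                 // direction, hence the subtractions in the loops below.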
1903 
1904             // Prepare the extrusion direction for extruding 4 vertices in parallel
1905             __m128 dir0 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(0,2,1,0));   // X Y Z X
1906             __m128 dir1 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(1,0,2,1));   // Y Z X Y
1907             __m128 dir2 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(2,1,0,2));   // Z X Y Z
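                 // Four packed vertices occupy 12 consecutive floats laid out XYZ XYZ XYZ XYZ,
                 // so the offset vector must be replicated in this rotating X Y Z X / Y Z X Y /
                 // Z X Y Z pattern to line up with each group of four floats.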
1908 
1909             __m128 s0, s1, s2;
1910             __m128 d0, d1, d2;
1911 
1912             size_t numIterations = numVertices / 4;
1913             numVertices &= 3;
1914 
1915             // Extruding 4 vertices per-iteration
1916             for (size_t i = 0; i < numIterations; ++i)
1917             {
1918                 s0 = SrcAccessor::load(pSrcPos + 0);
1919                 s1 = SrcAccessor::load(pSrcPos + 4);
1920                 s2 = SrcAccessor::load(pSrcPos + 8);
1921                 pSrcPos += 12;
1922 
1923                 // The extrusion direction is inverted, use subtract instruction here
1924                 d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
1925                 d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 X2 Y2
1926                 d2 = _mm_sub_ps(s2, dir2);                      // Z2 X3 Y3 Z3
1927 
1928                 DestAccessor::store(pDestPos + 0, d0);
1929                 DestAccessor::store(pDestPos + 4, d1);
1930                 DestAccessor::store(pDestPos + 8, d2);
1931                 pDestPos += 12;
1932             }
1933 
1934             // Dealing with remaining vertices
1935             switch (numVertices)
1936             {
1937             case 3:
1938                 // 9 floating-point values
1939                 s0 = SrcAccessor::load(pSrcPos + 0);
1940                 s1 = SrcAccessor::load(pSrcPos + 4);
1941                 s2 = _mm_load_ss(pSrcPos + 8);
1942 
1943                 // The extrusion direction is inverted, use subtract instruction here
1944                 d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
1945                 d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 X2 Y2
1946                 d2 = _mm_sub_ss(s2, dir2);                      // Z2 -- -- --
1947 
1948                 DestAccessor::store(pDestPos + 0, d0);
1949                 DestAccessor::store(pDestPos + 4, d1);
1950                 _mm_store_ss(pDestPos + 8, d2);
1951                 break;
1952 
1953             case 2:
1954                 // 6 floating-point values
1955                 s0 = SrcAccessor::load(pSrcPos + 0);
1956                 s1 = _mm_loadl_pi(dir1, (const __m64*)(pSrcPos + 4)); // dir1 is meaningless here
1957 
1958                 // The extrusion direction is inverted, use subtract instruction here
1959                 d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
1960                 d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 -- --
1961 
1962                 DestAccessor::store(pDestPos + 0, d0);
1963                 _mm_storel_pi((__m64*)(pDestPos + 4), d1);
1964                 break;
1965 
1966             case 1:
1967                 // 3 floating-point values
1968                 s0 = _mm_loadl_pi(dir0, (const __m64*)(pSrcPos + 0)); // dir0 is meaningless here
1969                 s1 = _mm_load_ss(pSrcPos + 2);
1970 
1971                 // The extrusion direction is inverted, use subtract instruction here
1972                 d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 -- --
1973                 d1 = _mm_sub_ss(s1, dir2);                      // Z0 -- -- --
1974 
1975                 _mm_storel_pi((__m64*)(pDestPos + 0), d0);
1976                 _mm_store_ss(pDestPos + 2, d1);
1977                 break;
1978             }
1979         }
1980     };
1981     //---------------------------------------------------------------------
1982     // Template to extrude vertices for point light.
1983     template <bool srcAligned, bool destAligned>
1984     struct ExtrudeVertices_SSE_PointLight
1985     {
1986         static void apply(
1987             const Vector4& lightPos,
1988             Real extrudeDist,
1989             const float* pSrcPos,
1990             float* pDestPos,
1991             size_t numVertices)
1992         {
1993             typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
1994             typedef SSEMemoryAccessor<destAligned> DestAccessor;
1995 
1996             // Point light: the extrusion direction is calculated for every vertex
1997 
1998             // Load light vector, unaligned
1999             __m128 lp = _mm_loadu_ps(&lightPos.x);
2000 
2001             // Load extrude distance
2002             __m128 extrudeDist4 = _mm_load_ps1(&extrudeDist);
2003 
2004             size_t numIterations = numVertices / 4;
2005             numVertices &= 3;
2006 
2007             // Extruding 4 vertices per-iteration
2008             for (size_t i = 0; i < numIterations; ++i)
2009             {
2010                 // Load source positions
2011                 __m128 s0 = SrcAccessor::load(pSrcPos + 0);     // x0 y0 z0 x1
2012                 __m128 s1 = SrcAccessor::load(pSrcPos + 4);     // y1 z1 x2 y2
2013                 __m128 s2 = SrcAccessor::load(pSrcPos + 8);     // z2 x3 y3 z3
2014                 pSrcPos += 12;
2015 
2016                 // Arrange into 3x4 component-major layout for batch calculation
2017                 __MM_TRANSPOSE4x3_PS(s0, s1, s2);
2018 
2019                 // Calculate unnormalised extrusion direction
2020                 __m128 dx = _mm_sub_ps(s0, __MM_SELECT(lp, 0)); // X0 X1 X2 X3
2021                 __m128 dy = _mm_sub_ps(s1, __MM_SELECT(lp, 1)); // Y0 Y1 Y2 Y3
2022                 __m128 dz = _mm_sub_ps(s2, __MM_SELECT(lp, 2)); // Z0 Z1 Z2 Z3
2023 
2024                 // Normalise extrusion direction and multiply by extrude distance
2025                 __m128 tmp = __MM_DOT3x3_PS(dx, dy, dz, dx, dy, dz);
2026                 tmp = _mm_mul_ps(_mm_rsqrt_ps(tmp), extrudeDist4);
2027                 dx = _mm_mul_ps(dx, tmp);
2028                 dy = _mm_mul_ps(dy, tmp);
2029                 dz = _mm_mul_ps(dz, tmp);
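                     // dx/dy/dz now hold, for each of the four vertices, an offset of roughly
                     // length extrudeDist pointing from the light through the vertex:
                     // __MM_DOT3x3_PS produced the squared length per lane and _mm_rsqrt_ps
                     // approximates its reciprocal square root.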
2030 
2031                 // Calculate extruded positions
2032                 __m128 d0 = _mm_add_ps(dx, s0);
2033                 __m128 d1 = _mm_add_ps(dy, s1);
2034                 __m128 d2 = _mm_add_ps(dz, s2);
2035 
2036                 // Arrange back into 4x3 contiguous format to store the results
2037                 __MM_TRANSPOSE3x4_PS(d0, d1, d2);
2038 
2039                 // Store extruded positions
2040                 DestAccessor::store(pDestPos + 0, d0);
2041                 DestAccessor::store(pDestPos + 4, d1);
2042                 DestAccessor::store(pDestPos + 8, d2);
2043                 pDestPos += 12;
2044             }
2045 
2046             // Dealing with remaining vertices
2047             for (size_t j = 0; j  < numVertices; ++j)
2048             {
2049                 // Load source position
2050                 __m128 src = _mm_loadh_pi(_mm_load_ss(pSrcPos + 0), (const __m64*)(pSrcPos + 1)); // x 0 y z
2051                 pSrcPos += 3;
2052 
2053                 // Calculate unnormalised extrusion direction
2054                 __m128 dir = _mm_sub_ps(src, _mm_shuffle_ps(lp, lp, _MM_SHUFFLE(2,1,3,0))); // X 1 Y Z
2055 
2056                 // Normalise extrusion direction and multiply by extrude distance
2057                 __m128 tmp = _mm_mul_ps(dir, dir);
2058                 tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_movehl_ps(tmp, tmp)), _mm_shuffle_ps(tmp, tmp, 3));
2059                 // Looks like VC7.1 generates somewhat inefficient code for 'rsqrtss', so use 'rsqrtps' instead
2060                 tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), extrudeDist4);
2061                 dir = _mm_mul_ps(dir, __MM_SELECT(tmp, 0));
2062 
2063                 // Calculate extruded position
2064                 __m128 dst = _mm_add_ps(dir, src);
2065 
2066                 // Store extruded position
2067                 _mm_store_ss(pDestPos + 0, dst);
2068                 _mm_storeh_pi((__m64*)(pDestPos + 1), dst);
2069                 pDestPos += 3;
2070             }
2071         }
2072     };
2073     //---------------------------------------------------------------------
2074     void OptimisedUtilSSE::extrudeVertices(
2075         const Vector4& lightPos,
2076         Real extrudeDist,
2077         const float* pSrcPos,
2078         float* pDestPos,
2079         size_t numVertices)
2080     {
2081         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
2082 
2083         // Note: Since pDestPos directly follows the tail of pSrcPos, we can't assume
2084         // it's properly aligned for SIMD, so we must check for it here.
2085         //
2086         // TODO: Add an extra vertex to the vertex buffer to make sure pDestPos
2087         // is aligned the same as pSrcPos.
2088         //
2089 
2090         // We use the SSE reciprocal square root directly while calculating the
2091         // extrusion direction, since the precision loss is not that important here.
2092         //
2093         if (lightPos.w == 0.0f)
2094         {
2095             if (_isAlignedForSSE(pSrcPos))
2096             {
2097                 if (_isAlignedForSSE(pDestPos))
2098                     ExtrudeVertices_SSE_DirectionalLight<true, true>::apply(
2099                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2100                 else
2101                     ExtrudeVertices_SSE_DirectionalLight<true, false>::apply(
2102                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2103             }
2104             else
2105             {
2106                 if (_isAlignedForSSE(pDestPos))
2107                     ExtrudeVertices_SSE_DirectionalLight<false, true>::apply(
2108                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2109                 else
2110                     ExtrudeVertices_SSE_DirectionalLight<false, false>::apply(
2111                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2112             }
2113         }
2114         else
2115         {
2116             assert(lightPos.w == 1.0f);
2117 
2118             if (_isAlignedForSSE(pSrcPos))
2119             {
2120                 if (_isAlignedForSSE(pDestPos))
2121                     ExtrudeVertices_SSE_PointLight<true, true>::apply(
2122                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2123                 else
2124                     ExtrudeVertices_SSE_PointLight<true, false>::apply(
2125                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2126             }
2127             else
2128             {
2129                 if (_isAlignedForSSE(pDestPos))
2130                     ExtrudeVertices_SSE_PointLight<false, true>::apply(
2131                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2132                 else
2133                     ExtrudeVertices_SSE_PointLight<false, false>::apply(
2134                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2135             }
2136         }
2137     }
2138     //---------------------------------------------------------------------
2139     //---------------------------------------------------------------------
2140     //---------------------------------------------------------------------
2141     extern OptimisedUtil* _getOptimisedUtilSSE(void)
2142     {
2143         static OptimisedUtilSSE msOptimisedUtilSSE;
2144 #if defined(__OGRE_SIMD_ALIGN_STACK)
2145         static OptimisedUtilWithStackAlign msOptimisedUtilWithStackAlign(&msOptimisedUtilSSE);
2146         return &msOptimisedUtilWithStackAlign;
2147 #else
2148         return &msOptimisedUtilSSE;
2149 #endif
2150     }
2151 
2152 }
2153 
2154 #endif // __OGRE_HAVE_SSE
2155