1 /*
2 -----------------------------------------------------------------------------
3 This source file is part of OGRE
4     (Object-oriented Graphics Rendering Engine)
5 For the latest info, see http://www.ogre3d.org/
6 
7 Copyright (c) 2000-2014 Torus Knot Software Ltd
8 
9 Permission is hereby granted, free of charge, to any person obtaining a copy
10 of this software and associated documentation files (the "Software"), to deal
11 in the Software without restriction, including without limitation the rights
12 to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
13 copies of the Software, and to permit persons to whom the Software is
14 furnished to do so, subject to the following conditions:
15 
16 The above copyright notice and this permission notice shall be included in
17 all copies or substantial portions of the Software.
18 
19 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
25 THE SOFTWARE.
26 -----------------------------------------------------------------------------
27 */
28 #include "OgreStableHeaders.h"
29 #include "OgreOptimisedUtil.h"
30 
31 
32 #if __OGRE_HAVE_SSE || __OGRE_HAVE_NEON
33 
34 // Keep these includes last, to avoid "xmmintrin.h" potentially being included by
35 // another header file first on some platforms.
36 #include "OgreSIMDHelper.h"
37 
38 // I'd like to merge this file with OgreOptimisedUtil.cpp, but that's
39 // impossible when compiling with gcc, because SSE code generation can only
40 // be enabled or disabled at the file level.
41 
42 //-------------------------------------------------------------------------
43 //
44 // The routines implemented in this file are performance oriented,
45 // which means saving every cycle possible. This requirement might
46 // break some C++/STL rules.
47 //
48 //
49 // Some rules I'd like to respect:
50 //
51 // 1. Prefer unpacklo/hi and movelh/hl over shuffle, because they can
52 //    save one byte of binary code :) (see the example after this block)
53 // 2. Use add/sub instead of mul where possible.
54 // 3. Eliminate function-call prologue code.
55 //
56 // Lastly, follow anything recommended by the Intel Optimization Reference Manual.
57 //
58 //-------------------------------------------------------------------------
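// Illustrative sketch of rule 1 above (not compiled; the helper name
// _sketch_combineLowHalves is made up for this example). Both forms build the
// same "a0 a1 b0 b1" result from the low halves of two registers, but MOVLHPS
// has no immediate byte, so its encoding is one byte shorter than SHUFPS:
#if 0
static inline __m128 _sketch_combineLowHalves(__m128 a, __m128 b)
{
    __m128 viaMove    = _mm_movelh_ps(a, b);                            // a0 a1 b0 b1
    __m128 viaShuffle = _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 1, 0));  // a0 a1 b0 b1
    (void)viaShuffle;   // both registers hold the same value
    return viaMove;
}
#endif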
59 
60 // Use the unrolled SSE version when the vertex count exceeds this limit
61 #define OGRE_SSE_SKINNING_UNROLL_VERTICES  16
62 
63 namespace Ogre {
64 
65 //-------------------------------------------------------------------------
66 // Local classes
67 //-------------------------------------------------------------------------
68 
69     /** SSE implementation of OptimisedUtil.
70     @note
71         Don't use this class directly, use OptimisedUtil instead.
72     */
73     class _OgrePrivate OptimisedUtilSSE : public OptimisedUtil
74     {
75     protected:
76         /// Do we prefer to use a general SSE version for position/normal shared buffers?
77         bool mPreferGeneralVersionForSharedBuffers;
78 
79     public:
80         /// Constructor
81         OptimisedUtilSSE(void);
82 
83         /// @copydoc OptimisedUtil::softwareVertexSkinning
84         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexSkinning(
85             const float *srcPosPtr, float *destPosPtr,
86             const float *srcNormPtr, float *destNormPtr,
87             const float *blendWeightPtr, const unsigned char* blendIndexPtr,
88             const Affine3* const* blendMatrices,
89             size_t srcPosStride, size_t destPosStride,
90             size_t srcNormStride, size_t destNormStride,
91             size_t blendWeightStride, size_t blendIndexStride,
92             size_t numWeightsPerVertex,
93             size_t numVertices);
94 
95         /// @copydoc OptimisedUtil::softwareVertexMorph
96         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexMorph(
97             Real t,
98             const float *srcPos1, const float *srcPos2,
99             float *dstPos,
100             size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
101             size_t numVertices,
102             bool morphNormals);
103 
104         /// @copydoc OptimisedUtil::concatenateAffineMatrices
105         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE concatenateAffineMatrices(
106             const Affine3& baseMatrix,
107             const Affine3* srcMatrices,
108             Affine3* dstMatrices,
109             size_t numMatrices);
110 
111         /// @copydoc OptimisedUtil::calculateFaceNormals
112         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateFaceNormals(
113             const float *positions,
114             const EdgeData::Triangle *triangles,
115             Vector4 *faceNormals,
116             size_t numTriangles);
117 
118         /// @copydoc OptimisedUtil::calculateLightFacing
119         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateLightFacing(
120             const Vector4& lightPos,
121             const Vector4* faceNormals,
122             char* lightFacings,
123             size_t numFaces);
124 
125         /// @copydoc OptimisedUtil::extrudeVertices
126         virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE extrudeVertices(
127             const Vector4& lightPos,
128             Real extrudeDist,
129             const float* srcPositions,
130             float* destPositions,
131             size_t numVertices);
132     };
133 
134 #if defined(__OGRE_SIMD_ALIGN_STACK)
135     /** Stack-align implementation of OptimisedUtil.
136     @remarks
137         User code compiled by icc and gcc might not align the stack
138         properly, so we need to ensure the stack is aligned to a 16-byte
139         boundary when executing SSE functions.
140     @par
141         We implement this by aligning the stack after a virtual function
142         call, which should guarantee that a call instruction is used instead
143         of inlining the underlying function body here (which might cause problems).
144     @note
145         Don't use this class directly, use OptimisedUtil instead.
146     */
147     class _OgrePrivate OptimisedUtilWithStackAlign : public OptimisedUtil
148     {
149     protected:
150         /// The actual implementation
151         OptimisedUtil* mImpl;
152 
153     public:
154         /// Constructor
155         OptimisedUtilWithStackAlign(OptimisedUtil* impl)
156             : mImpl(impl)
157         {
158         }
159 
160         /// @copydoc OptimisedUtil::softwareVertexSkinning
161         virtual void softwareVertexSkinning(
162             const float *srcPosPtr, float *destPosPtr,
163             const float *srcNormPtr, float *destNormPtr,
164             const float *blendWeightPtr, const unsigned char* blendIndexPtr,
165             const Affine3* const* blendMatrices,
166             size_t srcPosStride, size_t destPosStride,
167             size_t srcNormStride, size_t destNormStride,
168             size_t blendWeightStride, size_t blendIndexStride,
169             size_t numWeightsPerVertex,
170             size_t numVertices)
171         {
172             __OGRE_SIMD_ALIGN_STACK();
173 
174             mImpl->softwareVertexSkinning(
175                 srcPosPtr, destPosPtr,
176                 srcNormPtr, destNormPtr,
177                 blendWeightPtr, blendIndexPtr,
178                 blendMatrices,
179                 srcPosStride, destPosStride,
180                 srcNormStride, destNormStride,
181                 blendWeightStride, blendIndexStride,
182                 numWeightsPerVertex,
183                 numVertices);
184         }
185 
186         /// @copydoc OptimisedUtil::softwareVertexMorph
187         virtual void softwareVertexMorph(
188             Real t,
189             const float *srcPos1, const float *srcPos2,
190             float *dstPos,
191             size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
192             size_t numVertices,
193             bool morphNormals)
194         {
195             __OGRE_SIMD_ALIGN_STACK();
196 
197             mImpl->softwareVertexMorph(
198                 t,
199                 srcPos1, srcPos2,
200                 dstPos,
201                 pos1VSize, pos2VSize, dstVSize,
202                 numVertices,
203                 morphNormals);
204         }
205 
206         /// @copydoc OptimisedUtil::concatenateAffineMatrices
207         virtual void concatenateAffineMatrices(
208             const Affine3& baseMatrix,
209             const Affine3* srcMatrices,
210             Affine3* dstMatrices,
211             size_t numMatrices)
212         {
213             __OGRE_SIMD_ALIGN_STACK();
214 
215             mImpl->concatenateAffineMatrices(
216                 baseMatrix,
217                 srcMatrices,
218                 dstMatrices,
219                 numMatrices);
220         }
221 
222         /// @copydoc OptimisedUtil::calculateFaceNormals
223         virtual void calculateFaceNormals(
224             const float *positions,
225             const EdgeData::Triangle *triangles,
226             Vector4 *faceNormals,
227             size_t numTriangles)
228         {
229             __OGRE_SIMD_ALIGN_STACK();
230 
231             mImpl->calculateFaceNormals(
232                 positions,
233                 triangles,
234                 faceNormals,
235                 numTriangles);
236         }
237 
238         /// @copydoc OptimisedUtil::calculateLightFacing
239         virtual void calculateLightFacing(
240             const Vector4& lightPos,
241             const Vector4* faceNormals,
242             char* lightFacings,
243             size_t numFaces)
244         {
245             __OGRE_SIMD_ALIGN_STACK();
246 
247             mImpl->calculateLightFacing(
248                 lightPos,
249                 faceNormals,
250                 lightFacings,
251                 numFaces);
252         }
253 
254         /// @copydoc OptimisedUtil::extrudeVertices
255         virtual void extrudeVertices(
256             const Vector4& lightPos,
257             Real extrudeDist,
258             const float* srcPositions,
259             float* destPositions,
260             size_t numVertices)
261         {
262             __OGRE_SIMD_ALIGN_STACK();
263 
264             mImpl->extrudeVertices(
265                 lightPos,
266                 extrudeDist,
267                 srcPositions,
268                 destPositions,
269                 numVertices);
270         }
271     };
272 #endif  // defined(__OGRE_SIMD_ALIGN_STACK)
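    //-----------------------------------------------------------------------
    // Illustrative sketch only (an assumption, not the actual selection code,
    // which lives elsewhere, e.g. in OgreOptimisedUtil.cpp): how the stack-align
    // wrapper above is intended to be combined with the SSE implementation.
    // The function name below is hypothetical.
#if 0
    static OptimisedUtil* _sketchChooseSSEImplementation(void)
    {
        OptimisedUtil* impl = new OptimisedUtilSSE;
#if defined(__OGRE_SIMD_ALIGN_STACK)
        // Wrap it so every entry point re-aligns the stack before forwarding.
        impl = new OptimisedUtilWithStackAlign(impl);
#endif
        return impl;
    }
#endif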
273 
274 //---------------------------------------------------------------------
275 // Some useful macros for collapsing matrices.
276 //---------------------------------------------------------------------
277 
278 #define __LOAD_MATRIX(row0, row1, row2, pMatrix)                        \
279     {                                                                   \
280         row0 = __MM_LOAD_PS((*pMatrix)[0]);                             \
281         row1 = __MM_LOAD_PS((*pMatrix)[1]);                             \
282         row2 = __MM_LOAD_PS((*pMatrix)[2]);                             \
283     }
284 
285 #define __LERP_MATRIX(row0, row1, row2, weight, pMatrix)                \
286     {                                                                   \
287         row0 = __MM_LERP_PS(weight, row0, __MM_LOAD_PS((*pMatrix)[0])); \
288         row1 = __MM_LERP_PS(weight, row1, __MM_LOAD_PS((*pMatrix)[1])); \
289         row2 = __MM_LERP_PS(weight, row2, __MM_LOAD_PS((*pMatrix)[2])); \
290     }
291 
292 #define __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix)       \
293     {                                                                   \
294         row0 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[0]), weight);         \
295         row1 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[1]), weight);         \
296         row2 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[2]), weight);         \
297     }
298 
299 #define __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix)      \
300     {                                                                   \
301         row0 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[0]), weight, row0); \
302         row1 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[1]), weight, row1); \
303         row2 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[2]), weight, row2); \
304     }
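
// (A note on the helpers used above, judging by their names and usage - see
//  OgreSIMDHelper.h for the actual definitions: __MM_LOAD_PS(p) is an aligned
//  16-byte load of one matrix row, __MM_MADD_PS(a, b, c) computes a*b + c, and
//  __MM_LERP_PS(t, a, b) computes a + t*(b - a).)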
305 
306 //---------------------------------------------------------------------
307 // The following macros require variables declared by the caller.
308 //
309 // :) Thanks to the row-major matrices used in Ogre, accessing the affine matrices is easy.
310 //---------------------------------------------------------------------
311 
312 /** Collapse a one-weight matrix.
313     The multiply by the weight is eliminated, since the weight should always be equal to one.
314 */
315 #define __COLLAPSE_MATRIX_W1(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
316     {                                                                           \
317         pMatrix0 = ppMatrices[pIndices[0]];                                     \
318         __LOAD_MATRIX(row0, row1, row2, pMatrix0);                              \
319     }
320 
321 /** Collapse a two-weight matrix.
322     Based on the fact that the accumulated weights are equal to one, using a lerp
323     replaces two multiplies and one add with one multiply and two adds.
324 */
325 #define __COLLAPSE_MATRIX_W2(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
326     {                                                                           \
327         weight = _mm_load_ps1(pWeights + 1);                                    \
328         pMatrix0 = ppMatrices[pIndices[0]];                                     \
329         __LOAD_MATRIX(row0, row1, row2, pMatrix0);                              \
330         pMatrix1 = ppMatrices[pIndices[1]];                                     \
331         __LERP_MATRIX(row0, row1, row2, weight, pMatrix1);                      \
332     }
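
// A sketch of the arithmetic behind __COLLAPSE_MATRIX_W2: with w0 + w1 = 1,
//
//     M = w0*M0 + w1*M1 = (1 - w1)*M0 + w1*M1 = M0 + w1*(M1 - M0)
//
// i.e. one multiply and two adds per element, which is the lerp form applied
// row by row above.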
333 
334 /** Collapse a three-weight matrix.
335 */
336 #define __COLLAPSE_MATRIX_W3(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
337     {                                                                           \
338         weight = _mm_load_ps1(pWeights + 0);                                    \
339         pMatrix0 = ppMatrices[pIndices[0]];                                     \
340         __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0);             \
341         weight = _mm_load_ps1(pWeights + 1);                                    \
342         pMatrix1 = ppMatrices[pIndices[1]];                                     \
343         __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1);            \
344         weight = _mm_load_ps1(pWeights + 2);                                    \
345         pMatrix2 = ppMatrices[pIndices[2]];                                     \
346         __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2);            \
347     }
348 
349 /** Collapse a four-weight matrix.
350 */
351 #define __COLLAPSE_MATRIX_W4(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
352     {                                                                           \
353         /* Load four blend weights at one time, they will be shuffled later */  \
354         weights = _mm_loadu_ps(pWeights);                                       \
355                                                                                 \
356         pMatrix0 = ppMatrices[pIndices[0]];                                     \
357         weight = __MM_SELECT(weights, 0);                                       \
358         __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0);             \
359         pMatrix1 = ppMatrices[pIndices[1]];                                     \
360         weight = __MM_SELECT(weights, 1);                                       \
361         __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1);            \
362         pMatrix2 = ppMatrices[pIndices[2]];                                     \
363         weight = __MM_SELECT(weights, 2);                                       \
364         __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2);            \
365         pMatrix3 = ppMatrices[pIndices[3]];                                     \
366         weight = __MM_SELECT(weights, 3);                                       \
367         __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix3);            \
368     }
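
// (__MM_SELECT(weights, i) is presumably a splat of lane i of 'weights' across
//  all four lanes, so each __ACCUM_WEIGHTED_MATRIX step above multiplies full
//  matrix rows by one scalar blend weight replicated through a register.)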
369 
370 
371 
372     //---------------------------------------------------------------------
373     // Collapse one matrix at a time. The collapsed matrix is weighted by the
374     // blend weights, and can then be used to transform the corresponding vertex directly.
375     //
376     // I'd prefer an inline function instead of a macro here, but I also want to
377     // ensure the compiler integrates this code into its callers (in release builds
378     // at least), regardless of specific compile options. An inline function
379     // works fine for VC, but gcc (3.4.4 here) seems to generate a function call
380     // when implemented as an inline function, even when compiling with "-O3".
381     //
382 #define _collapseOneMatrix(                                                     \
383         m00, m01, m02,                                                          \
384         pBlendWeight, pBlendIndex,                                              \
385         blendMatrices,                                                          \
386         blendWeightStride, blendIndexStride,                                    \
387         numWeightsPerVertex)                                                    \
388     {                                                                           \
389         /* Important note: if the pMatrixXXX variables are reused        */     \
390         /* frequently, MSVC 7.1 will generate wrong code here!!!         */     \
391         const Affine3* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3;               \
392         __m128 weight, weights;                                                 \
393                                                                                 \
394         switch (numWeightsPerVertex)                                            \
395         {                                                                       \
396         default:    /* Just in case and make compiler happy */                  \
397         case 1:                                                                 \
398             __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices,                  \
399                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
400                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
401             break;                                                              \
402                                                                                 \
403         case 2:                                                                 \
404             __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices,                  \
405                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
406                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
407             break;                                                              \
408                                                                                 \
409         case 3:                                                                 \
410             __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices,                  \
411                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
412                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
413             break;                                                              \
414                                                                                 \
415         case 4:                                                                 \
416             __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices,                  \
417                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
418                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
419             break;                                                              \
420         }                                                                       \
421     }
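
    // For reference, a scalar sketch of what _collapseOneMatrix computes for one
    // vertex (illustrative only, not compiled; the real code above also applies
    // the rawOffsetPointer strides and the per-weight-count specialisations).
    // The helper name below is made up for this example.
#if 0
    static void _sketchCollapseOneMatrix(float collapsed[3][4],
                                         const float* pBlendWeight,
                                         const unsigned char* pBlendIndex,
                                         const Affine3* const* blendMatrices,
                                         size_t numWeightsPerVertex)
    {
        // collapsed = sum over i of weight[i] * blendMatrices[index[i]], rows 0..2 only
        for (size_t r = 0; r < 3; ++r)
            for (size_t c = 0; c < 4; ++c)
                collapsed[r][c] = 0.0f;

        for (size_t i = 0; i < numWeightsPerVertex; ++i)
            for (size_t r = 0; r < 3; ++r)
                for (size_t c = 0; c < 4; ++c)
                    collapsed[r][c] += pBlendWeight[i] * (*blendMatrices[pBlendIndex[i]])[r][c];
    }
#endif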
422 
423     //---------------------------------------------------------------------
424     // Collapse four matrices at one time. The collapsed matrices are weighted by the
425     // blend weights, and can then be used to transform the corresponding vertices directly.
426     //
427     // I'd prefer an inline function instead of a macro here, but I also want to
428     // ensure the compiler integrates this code into its callers (in release builds
429     // at least), regardless of specific compile options. An inline function
430     // works fine for VC, but gcc (3.4.4 here) seems to generate a function call
431     // when implemented as an inline function, even when compiling with "-O3".
432     //
433 #define _collapseFourMatrices(                                                  \
434         m00, m01, m02,                                                          \
435         m10, m11, m12,                                                          \
436         m20, m21, m22,                                                          \
437         m30, m31, m32,                                                          \
438         pBlendWeight, pBlendIndex,                                              \
439         blendMatrices,                                                          \
440         blendWeightStride, blendIndexStride,                                    \
441         numWeightsPerVertex)                                                    \
442     {                                                                           \
443         /* Important note: if the pMatrixXXX variables are reused        */     \
444         /* frequently, MSVC 7.1 will generate wrong code here!!!         */     \
445         const Affine3* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3;               \
446         __m128 weight, weights;                                                 \
447                                                                                 \
448         switch (numWeightsPerVertex)                                            \
449         {                                                                       \
450         default:    /* Just in case and make compiler happy */                  \
451         case 1:                                                                 \
452             __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices,                  \
453                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
454                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
455             __COLLAPSE_MATRIX_W1(m10, m11, m12, blendMatrices,                  \
456                 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
457                 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
458             __COLLAPSE_MATRIX_W1(m20, m21, m22, blendMatrices,                  \
459                 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
460                 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
461             __COLLAPSE_MATRIX_W1(m30, m31, m32, blendMatrices,                  \
462                 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
463                 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
464             break;                                                              \
465                                                                                 \
466         case 2:                                                                 \
467             __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices,                  \
468                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
469                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
470             __COLLAPSE_MATRIX_W2(m10, m11, m12, blendMatrices,                  \
471                 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
472                 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
473             __COLLAPSE_MATRIX_W2(m20, m21, m22, blendMatrices,                  \
474                 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
475                 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
476             __COLLAPSE_MATRIX_W2(m30, m31, m32, blendMatrices,                  \
477                 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
478                 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
479             break;                                                              \
480                                                                                 \
481         case 3:                                                                 \
482             __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices,                  \
483                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
484                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
485             __COLLAPSE_MATRIX_W3(m10, m11, m12, blendMatrices,                  \
486                 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
487                 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
488             __COLLAPSE_MATRIX_W3(m20, m21, m22, blendMatrices,                  \
489                 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
490                 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
491             __COLLAPSE_MATRIX_W3(m30, m31, m32, blendMatrices,                  \
492                 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
493                 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
494             break;                                                              \
495                                                                                 \
496         case 4:                                                                 \
497             __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices,                  \
498                 rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
499                 rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
500             __COLLAPSE_MATRIX_W4(m10, m11, m12, blendMatrices,                  \
501                 rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
502                 rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
503             __COLLAPSE_MATRIX_W4(m20, m21, m22, blendMatrices,                  \
504                 rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
505                 rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
506             __COLLAPSE_MATRIX_W4(m30, m31, m32, blendMatrices,                  \
507                 rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
508                 rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
509             break;                                                              \
510         }                                                                       \
511     }
512 
513 
514     //---------------------------------------------------------------------
515     // General SSE version for skinning positions, with optional skinning of normals.
516     static void softwareVertexSkinning_SSE_General(
517         const float *pSrcPos, float *pDestPos,
518         const float *pSrcNorm, float *pDestNorm,
519         const float *pBlendWeight, const unsigned char* pBlendIndex,
520         const Affine3* const* blendMatrices,
521         size_t srcPosStride, size_t destPosStride,
522         size_t srcNormStride, size_t destNormStride,
523         size_t blendWeightStride, size_t blendIndexStride,
524         size_t numWeightsPerVertex,
525         size_t numVertices)
526     {
527         for (size_t i = 0; i < numVertices; ++i)
528         {
529             // Collapse matrices
530             __m128 m00, m01, m02;
531             _collapseOneMatrix(
532                 m00, m01, m02,
533                 pBlendWeight, pBlendIndex,
534                 blendMatrices,
535                 blendWeightStride, blendIndexStride,
536                 numWeightsPerVertex);
537 
538             // Advance blend weight and index pointers
539             advanceRawPointer(pBlendWeight, blendWeightStride);
540             advanceRawPointer(pBlendIndex, blendIndexStride);
541 
542             //------------------------------------------------------------------
543 
544             // Rearrange to a column-major matrix with the rows shuffled into the order: Z 0 X Y
545             __m128 m03 = _mm_setzero_ps();
546             __MM_TRANSPOSE4x4_PS(m02, m03, m00, m01);
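            // (After this transpose, m02/m03/m00/m01 each hold one column of the
            //  collapsed 3x4 matrix, with the rows kept in the odd Z 0 X Y order.
            //  The dot product below therefore produces its result directly as
            //  "z 0 x y", which matches the storeh_pi/store_ss pair used to write
            //  the position out.)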
547 
548             //------------------------------------------------------------------
549             // Transform position
550             //------------------------------------------------------------------
551 
552             __m128 s0, s1, s2;
553 
554             // Load source position
555             s0 = _mm_load_ps1(pSrcPos + 0);
556             s1 = _mm_load_ps1(pSrcPos + 1);
557             s2 = _mm_load_ps1(pSrcPos + 2);
558 
559             // Transform by collapsed matrix
560             __m128 accumPos = __MM_DOT4x3_PS(m02, m03, m00, m01, s0, s1, s2);   // z 0 x y
561 
562             // Store blended position, no alignment requirement
563             _mm_storeh_pi((__m64*)pDestPos, accumPos);
564             _mm_store_ss(pDestPos+2, accumPos);
565 
566             // Advance source and target position pointers
567             advanceRawPointer(pSrcPos, srcPosStride);
568             advanceRawPointer(pDestPos, destPosStride);
569 
570             //------------------------------------------------------------------
571             // Optional blend normal
572             //------------------------------------------------------------------
573 
574             if (pSrcNorm)
575             {
576                 // Load source normal
577                 s0 = _mm_load_ps1(pSrcNorm + 0);
578                 s1 = _mm_load_ps1(pSrcNorm + 1);
579                 s2 = _mm_load_ps1(pSrcNorm + 2);
580 
581                 // Transform by collapsed matrix
582                 __m128 accumNorm = __MM_DOT3x3_PS(m02, m03, m00, s0, s1, s2);   // z 0 x y
583 
584                 // Normalise normal
585                 __m128 tmp = _mm_mul_ps(accumNorm, accumNorm);                  // z^2 0 x^2 y^2
586                 tmp = __MM_ACCUM3_PS(tmp,
587                         _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,1,2)),         // x^2 0 y^2 z^2
588                         _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,0,1,3)));        // y^2 0 z^2 x^2
589                 // Note: this may divide by zero, but the effect is negligible
590                 tmp = __MM_RSQRT_PS(tmp);
591                 accumNorm = _mm_mul_ps(accumNorm, tmp);
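                // (i.e. n' = n * (1 / sqrt(n.n)), using a reciprocal square root
                //  so no divide instruction is needed.)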
592 
593                 // Store blended normal, no alignment requirement
594                 _mm_storeh_pi((__m64*)pDestNorm, accumNorm);
595                 _mm_store_ss(pDestNorm+2, accumNorm);
596 
597                 // Advance source and target normal pointers
598                 advanceRawPointer(pSrcNorm, srcNormStride);
599                 advanceRawPointer(pDestNorm, destNormStride);
600             }
601         }
602     }
603     //---------------------------------------------------------------------
604     // Special SSE version for skinning a shared buffer of positions and normals,
605     // where the buffer is packed.
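    //
    // The expected vertex layout in the shared packed buffer is 24 bytes per vertex:
    //
    //     float px, py, pz;   // position
    //     float nx, ny, nz;   // normal
    //
    // so each iteration below consumes four vertices = 96 bytes = six __m128 loads.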
606     template <bool srcAligned, bool destAligned>
607     struct SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed
608     {
609         static void apply(
610             const float* pSrc, float* pDest,
611             const float* pBlendWeight, const unsigned char* pBlendIndex,
612             const Affine3* const* blendMatrices,
613             size_t blendWeightStride, size_t blendIndexStride,
614             size_t numWeightsPerVertex,
615             size_t numIterations)
616         {
617             typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
618             typedef SSEMemoryAccessor<destAligned> DestAccessor;
619 
620             // Blending 4 vertices per-iteration
621             for (size_t i = 0; i < numIterations; ++i)
622             {
623                 // Collapse matrices
624                 __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
625                 _collapseFourMatrices(
626                     m00, m01, m02,
627                     m10, m11, m12,
628                     m20, m21, m22,
629                     m30, m31, m32,
630                     pBlendWeight, pBlendIndex,
631                     blendMatrices,
632                     blendWeightStride, blendIndexStride,
633                     numWeightsPerVertex);
634 
635                 // Advance 4 vertices
636                 advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
637                 advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
638 
639                 //------------------------------------------------------------------
640                 // Transform position/normals
641                 //------------------------------------------------------------------
642 
643                 __m128 s0, s1, s2, s3, s4, s5, d0, d1, d2, d3, d4, d5;
644                 __m128 t0, t1, t2, t3, t4, t5;
645 
646                 // Load source position/normals
647                 s0 = SrcAccessor::load(pSrc + 0);                       // px0 py0 pz0 nx0
648                 s1 = SrcAccessor::load(pSrc + 4);                       // ny0 nz0 px1 py1
649                 s2 = SrcAccessor::load(pSrc + 8);                       // pz1 nx1 ny1 nz1
650                 s3 = SrcAccessor::load(pSrc + 12);                      // px2 py2 pz2 nx2
651                 s4 = SrcAccessor::load(pSrc + 16);                      // ny2 nz2 px3 py3
652                 s5 = SrcAccessor::load(pSrc + 20);                      // pz3 nx3 ny3 nz3
653 
654                 // Rearrange to component-major order for batch calculation.
655 
656                 t0 = _mm_unpacklo_ps(s0, s3);                           // px0 px2 py0 py2
657                 t1 = _mm_unpackhi_ps(s0, s3);                           // pz0 pz2 nx0 nx2
658                 t2 = _mm_unpacklo_ps(s1, s4);                           // ny0 ny2 nz0 nz2
659                 t3 = _mm_unpackhi_ps(s1, s4);                           // px1 px3 py1 py3
660                 t4 = _mm_unpacklo_ps(s2, s5);                           // pz1 pz3 nx1 nx3
661                 t5 = _mm_unpackhi_ps(s2, s5);                           // ny1 ny3 nz1 nz3
662 
663                 s0 = _mm_unpacklo_ps(t0, t3);                           // px0 px1 px2 px3
664                 s1 = _mm_unpackhi_ps(t0, t3);                           // py0 py1 py2 py3
665                 s2 = _mm_unpacklo_ps(t1, t4);                           // pz0 pz1 pz2 pz3
666                 s3 = _mm_unpackhi_ps(t1, t4);                           // nx0 nx1 nx2 nx3
667                 s4 = _mm_unpacklo_ps(t2, t5);                           // ny0 ny1 ny2 ny3
668                 s5 = _mm_unpackhi_ps(t2, t5);                           // nz0 nz1 nz2 nz3
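
                // (The two unpack passes above are a 6-register AoS-to-SoA transpose:
                //  six interleaved position/normal vectors become six registers, each
                //  holding one component of four vertices.)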
669 
670                 // Transform by collapsed matrix
671 
672                 // Shuffle row 0 of the four collapsed matrices for calculating the X components
673                 __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
674 
675                 // Transform X components
676                 d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // PX0 PX1 PX2 PX3
677                 d3 = __MM_DOT3x3_PS(m00, m10, m20, s3, s4, s5);         // NX0 NX1 NX2 NX3
678 
679                 // Shuffle row 1 of the four collapsed matrices for calculating the Y components
680                 __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
681 
682                 // Transform Y components
683                 d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // PY0 PY1 PY2 PY3
684                 d4 = __MM_DOT3x3_PS(m01, m11, m21, s3, s4, s5);         // NY0 NY1 NY2 NY3
685 
686                 // Shuffle row 2 of the four collapsed matrices for calculating the Z components
687                 __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
688 
689                 // Transform Z components
690                 d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // PZ0 PZ1 PZ2 PZ3
691                 d5 = __MM_DOT3x3_PS(m02, m12, m22, s3, s4, s5);         // NZ0 NZ1 NZ2 NZ3
692 
693                 // Normalise normals
694                 __m128 tmp = __MM_DOT3x3_PS(d3, d4, d5, d3, d4, d5);
695                 tmp = __MM_RSQRT_PS(tmp);
696                 d3 = _mm_mul_ps(d3, tmp);
697                 d4 = _mm_mul_ps(d4, tmp);
698                 d5 = _mm_mul_ps(d5, tmp);
699 
700                 // Arrange back to the contiguous format for storing the results
701 
702                 t0 = _mm_unpacklo_ps(d0, d1);                           // PX0 PY0 PX1 PY1
703                 t1 = _mm_unpackhi_ps(d0, d1);                           // PX2 PY2 PX3 PY3
704                 t2 = _mm_unpacklo_ps(d2, d3);                           // PZ0 NX0 PZ1 NX1
705                 t3 = _mm_unpackhi_ps(d2, d3);                           // PZ2 NX2 PZ3 NX3
706                 t4 = _mm_unpacklo_ps(d4, d5);                           // NY0 NZ0 NY1 NZ1
707                 t5 = _mm_unpackhi_ps(d4, d5);                           // NY2 NZ2 NY3 NZ3
708 
709                 d0 = _mm_movelh_ps(t0, t2);                             // PX0 PY0 PZ0 NX0
710                 d1 = _mm_shuffle_ps(t4, t0, _MM_SHUFFLE(3,2,1,0));      // NY0 NZ0 PX1 PY1
711                 d2 = _mm_movehl_ps(t4, t2);                             // PZ1 NX1 NY1 NZ1
712                 d3 = _mm_movelh_ps(t1, t3);                             // PX2 PY2 PZ2 NX2
713                 d4 = _mm_shuffle_ps(t5, t1, _MM_SHUFFLE(3,2,1,0));      // NY2 NZ2 PX3 PY3
714                 d5 = _mm_movehl_ps(t5, t3);                             // PZ3 NX3 NY3 NZ3
715 
716                 // Store blended position/normals
717                 DestAccessor::store(pDest + 0, d0);
718                 DestAccessor::store(pDest + 4, d1);
719                 DestAccessor::store(pDest + 8, d2);
720                 DestAccessor::store(pDest + 12, d3);
721                 DestAccessor::store(pDest + 16, d4);
722                 DestAccessor::store(pDest + 20, d5);
723 
724                 // Advance 4 vertices
725                 pSrc += 4 * (3 + 3);
726                 pDest += 4 * (3 + 3);
727             }
728         }
729     };
730     static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
731             const float* pSrcPos, float* pDestPos,
732             const float* pBlendWeight, const unsigned char* pBlendIndex,
733             const Affine3* const* blendMatrices,
734             size_t blendWeightStride, size_t blendIndexStride,
735             size_t numWeightsPerVertex,
736             size_t numIterations)
737     {
738         // pSrcPos might not be 16-byte aligned, because of the 8-byte alignment shift per vertex
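        //
        // (Worked example of the shift: with 24 bytes per packed pos+normal vertex,
        //  if vertex 0 starts at an address with (addr & 15) == 0, vertex 1 starts
        //  at (addr + 24) & 15 == 8, vertex 2 at 0 again, and so on - so a pointer
        //  that is 16-byte aligned for one vertex is only 8-byte aligned for the next.)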
739 
740         // Instantiate two versions only, since the other alignment combinations are not that important.
741         if (_isAlignedForSSE(pSrcPos) && _isAlignedForSSE(pDestPos))
742         {
743             SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<true, true>::apply(
744                 pSrcPos, pDestPos,
745                 pBlendWeight, pBlendIndex,
746                 blendMatrices,
747                 blendWeightStride, blendIndexStride,
748                 numWeightsPerVertex,
749                 numIterations);
750         }
751         else
752         {
753             SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<false, false>::apply(
754                 pSrcPos, pDestPos,
755                 pBlendWeight, pBlendIndex,
756                 blendMatrices,
757                 blendWeightStride, blendIndexStride,
758                 numWeightsPerVertex,
759                 numIterations);
760         }
761     }
762     //---------------------------------------------------------------------
763     // Special SSE version for skinning separate buffers of positions and normals,
764     // where both the position and normal buffers are packed.
765     template <bool srcPosAligned, bool destPosAligned, bool srcNormAligned, bool destNormAligned>
766     struct SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed
767     {
768         static void apply(
769             const float* pSrcPos, float* pDestPos,
770             const float* pSrcNorm, float* pDestNorm,
771             const float* pBlendWeight, const unsigned char* pBlendIndex,
772             const Affine3* const* blendMatrices,
773             size_t blendWeightStride, size_t blendIndexStride,
774             size_t numWeightsPerVertex,
775             size_t numIterations)
776         {
777             typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
778             typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;
779             typedef SSEMemoryAccessor<srcNormAligned> SrcNormAccessor;
780             typedef SSEMemoryAccessor<destNormAligned> DestNormAccessor;
781 
782             // Blending 4 vertices per-iteration
783             for (size_t i = 0; i < numIterations; ++i)
784             {
785                 // Collapse matrices
786                 __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
787                 _collapseFourMatrices(
788                     m00, m01, m02,
789                     m10, m11, m12,
790                     m20, m21, m22,
791                     m30, m31, m32,
792                     pBlendWeight, pBlendIndex,
793                     blendMatrices,
794                     blendWeightStride, blendIndexStride,
795                     numWeightsPerVertex);
796 
797                 // Advance 4 vertices
798                 advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
799                 advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
800 
801                 //------------------------------------------------------------------
802                 // Transform positions
803                 //------------------------------------------------------------------
804 
805                 __m128 s0, s1, s2, d0, d1, d2;
806 
807                 // Load source positions
808                 s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
809                 s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
810                 s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3
811 
812                 // Arrange to 3x4 component-major order for batch calculation
813                 __MM_TRANSPOSE4x3_PS(s0, s1, s2);
814 
815                 // Transform by collapsed matrix
816 
817                 // Shuffle row 0 of the four collapsed matrices for calculating the X components
818                 __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
819 
820                 // Transform X components
821                 d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3
822 
823                 // Shuffle row 1 of the four collapsed matrices for calculating the Y components
824                 __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
825 
826                 // Transform Y components
827                 d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3
828 
829                 // Shuffle row 2 of the four collapsed matrices for calculating the Z components
830                 __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
831 
832                 // Transform Z components
833                 d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3
834 
835                 // Arrange back to 4x3 contiguous format for storing the results
836                 __MM_TRANSPOSE3x4_PS(d0, d1, d2);
837 
838                 // Store blended positions
839                 DestPosAccessor::store(pDestPos + 0, d0);
840                 DestPosAccessor::store(pDestPos + 4, d1);
841                 DestPosAccessor::store(pDestPos + 8, d2);
842 
843                 // Advance 4 vertices
844                 pSrcPos += 4 * 3;
845                 pDestPos += 4 * 3;
846 
847                 //------------------------------------------------------------------
848                 // Transform normals
849                 //------------------------------------------------------------------
850 
851                 // Load source normals
852                 s0 = SrcNormAccessor::load(pSrcNorm + 0);               // x0 y0 z0 x1
853                 s1 = SrcNormAccessor::load(pSrcNorm + 4);               // y1 z1 x2 y2
854                 s2 = SrcNormAccessor::load(pSrcNorm + 8);               // z2 x3 y3 z3
855 
856                 // Arrange to 3x4 component-major order for batch calculation
857                 __MM_TRANSPOSE4x3_PS(s0, s1, s2);
858 
859                 // Transform by collapsed and shuffled matrices
860                 d0 = __MM_DOT3x3_PS(m00, m10, m20, s0, s1, s2);         // X0 X1 X2 X3
861                 d1 = __MM_DOT3x3_PS(m01, m11, m21, s0, s1, s2);         // Y0 Y1 Y2 Y3
862                 d2 = __MM_DOT3x3_PS(m02, m12, m22, s0, s1, s2);         // Z0 Z1 Z2 Z3
863 
864                 // Normalise normals
865                 __m128 tmp = __MM_DOT3x3_PS(d0, d1, d2, d0, d1, d2);
866                 tmp = __MM_RSQRT_PS(tmp);
867                 d0 = _mm_mul_ps(d0, tmp);
868                 d1 = _mm_mul_ps(d1, tmp);
869                 d2 = _mm_mul_ps(d2, tmp);
870 
871                 // Arrange back to 4x3 contiguous format for storing the results
872                 __MM_TRANSPOSE3x4_PS(d0, d1, d2);
873 
874                 // Store blended normals
875                 DestNormAccessor::store(pDestNorm + 0, d0);
876                 DestNormAccessor::store(pDestNorm + 4, d1);
877                 DestNormAccessor::store(pDestNorm + 8, d2);
878 
879                 // Advance 4 vertices
880                 pSrcNorm += 4 * 3;
881                 pDestNorm += 4 * 3;
882             }
883         }
884     };
885     static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
886         const float* pSrcPos, float* pDestPos,
887         const float* pSrcNorm, float* pDestNorm,
888         const float* pBlendWeight, const unsigned char* pBlendIndex,
889         const Affine3* const* blendMatrices,
890         size_t blendWeightStride, size_t blendIndexStride,
891         size_t numWeightsPerVertex,
892         size_t numIterations)
893     {
894         assert(_isAlignedForSSE(pSrcPos));
895 
896         // Instantiate two versions only, since the other alignment combinations are not that important.
897         if (_isAlignedForSSE(pSrcNorm) && _isAlignedForSSE(pDestPos) && _isAlignedForSSE(pDestNorm))
898         {
899             SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, true, true, true>::apply(
900                 pSrcPos, pDestPos,
901                 pSrcNorm, pDestNorm,
902                 pBlendWeight, pBlendIndex,
903                 blendMatrices,
904                 blendWeightStride, blendIndexStride,
905                 numWeightsPerVertex,
906                 numIterations);
907         }
908         else
909         {
910             SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, false, false, false>::apply(
911                 pSrcPos, pDestPos,
912                 pSrcNorm, pDestNorm,
913                 pBlendWeight, pBlendIndex,
914                 blendMatrices,
915                 blendWeightStride, blendIndexStride,
916                 numWeightsPerVertex,
917                 numIterations);
918         }
919     }
920     //---------------------------------------------------------------------
921     // Special SSE version for skinning positions only, where the position buffer
922     // is packed.
923     template <bool srcPosAligned, bool destPosAligned>
924     struct SoftwareVertexSkinning_SSE_PosOnly_Packed
925     {
926         static void apply(
927             const float* pSrcPos, float* pDestPos,
928             const float* pBlendWeight, const unsigned char* pBlendIndex,
929             const Affine3* const* blendMatrices,
930             size_t blendWeightStride, size_t blendIndexStride,
931             size_t numWeightsPerVertex,
932             size_t numIterations)
933         {
934             typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
935             typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;
936 
937             // Blending 4 vertices per-iteration
938             for (size_t i = 0; i < numIterations; ++i)
939             {
940                 // Collapse matrices
941                 __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
942                 _collapseFourMatrices(
943                     m00, m01, m02,
944                     m10, m11, m12,
945                     m20, m21, m22,
946                     m30, m31, m32,
947                     pBlendWeight, pBlendIndex,
948                     blendMatrices,
949                     blendWeightStride, blendIndexStride,
950                     numWeightsPerVertex);
951 
952                 // Advance 4 vertices
953                 advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
954                 advanceRawPointer(pBlendIndex, 4 * blendIndexStride);
955 
956                 //------------------------------------------------------------------
957                 // Transform positions
958                 //------------------------------------------------------------------
959 
960                 __m128 s0, s1, s2, d0, d1, d2;
961 
962                 // Load source positions
963                 s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
964                 s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
965                 s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3
966 
967                 // Arrange to 3x4 component-major order for batch calculation
968                 __MM_TRANSPOSE4x3_PS(s0, s1, s2);
969 
970                 // Transform by collapsed matrix
971 
972                 // Shuffle row 0 of the four collapsed matrices for calculating the X components
973                 __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);
974 
975                 // Transform X components
976                 d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3
977 
978                 // Shuffle row 1 of the four collapsed matrices for calculating the Y components
979                 __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);
980 
981                 // Transform Y components
982                 d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3
983 
984                 // Shuffle row 2 of the four collapsed matrices for calculating the Z components
985                 __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);
986 
987                 // Transform Z components
988                 d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3
989 
990                 // Arrange back to 4x3 contiguous format for storing the results
991                 __MM_TRANSPOSE3x4_PS(d0, d1, d2);
992 
993                 // Store blended positions
994                 DestPosAccessor::store(pDestPos + 0, d0);
995                 DestPosAccessor::store(pDestPos + 4, d1);
996                 DestPosAccessor::store(pDestPos + 8, d2);
997 
998                 // Advance 4 vertices
999                 pSrcPos += 4 * 3;
1000                 pDestPos += 4 * 3;
1001             }
1002         }
1003     };
1004     static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosOnly_Packed(
1005         const float* pSrcPos, float* pDestPos,
1006         const float* pBlendWeight, const unsigned char* pBlendIndex,
1007         const Affine3* const* blendMatrices,
1008         size_t blendWeightStride, size_t blendIndexStride,
1009         size_t numWeightsPerVertex,
1010         size_t numIterations)
1011     {
1012         assert(_isAlignedForSSE(pSrcPos));
1013 
1014         // Instantiate two versions only, since the other alignment combinations are not that important.
1015         if (_isAlignedForSSE(pDestPos))
1016         {
1017             SoftwareVertexSkinning_SSE_PosOnly_Packed<true, true>::apply(
1018                 pSrcPos, pDestPos,
1019                 pBlendWeight, pBlendIndex,
1020                 blendMatrices,
1021                 blendWeightStride, blendIndexStride,
1022                 numWeightsPerVertex,
1023                 numIterations);
1024         }
1025         else
1026         {
1027             SoftwareVertexSkinning_SSE_PosOnly_Packed<true, false>::apply(
1028                 pSrcPos, pDestPos,
1029                 pBlendWeight, pBlendIndex,
1030                 blendMatrices,
1031                 blendWeightStride, blendIndexStride,
1032                 numWeightsPerVertex,
1033                 numIterations);
1034         }
1035     }
1036     //---------------------------------------------------------------------
1037     //---------------------------------------------------------------------
1038     //---------------------------------------------------------------------
1039     OptimisedUtilSSE::OptimisedUtilSSE(void)
1040         : mPreferGeneralVersionForSharedBuffers(false)
1041     {
1042         // On the AMD Athlon XP (but not the Athlon 64), it is preferable never to use the
1043         // unrolled version for shared buffers at all, presumably because that version
1044         // runs out of usable CPU registers, or hits L1/L2 cache problems, causing a
1045         // slight performance loss compared to the general version.
1046         //
1047 #if __OGRE_HAVE_NEON == 0
1048         if (PlatformInformation::getCpuIdentifier().find("AuthenticAMD") != String::npos)
1049         {
1050             // How can we tell an Athlon XP from an Athlon 64? Just test whether
1051             // SSE2/SSE3 are supported; if not, assume the general version is
1052             // faster than the unrolled version :)
1053             //
1054             if (!(PlatformInformation::getCpuFeatures() &
1055                 (PlatformInformation::CPU_FEATURE_SSE2 | PlatformInformation::CPU_FEATURE_SSE3)))
1056             {
1057                 mPreferGeneralVersionForSharedBuffers = true;
1058             }
1059         }
1060 #endif
1061     }
1062     //---------------------------------------------------------------------
1063     void OptimisedUtilSSE::softwareVertexSkinning(
1064         const float *pSrcPos, float *pDestPos,
1065         const float *pSrcNorm, float *pDestNorm,
1066         const float *pBlendWeight, const unsigned char* pBlendIndex,
1067         const Affine3* const* blendMatrices,
1068         size_t srcPosStride, size_t destPosStride,
1069         size_t srcNormStride, size_t destNormStride,
1070         size_t blendWeightStride, size_t blendIndexStride,
1071         size_t numWeightsPerVertex,
1072         size_t numVertices)
1073     {
1074         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1075 
1076         // All position/normal pointers should be perfectly aligned, but we still check here
1077         // to guard against hardware buffers allocated by a potentially buggy driver that
1078         // doesn't honour the alignment properly.
1079         // Because the meta-function technique is used here, the code is easy to maintain
1080         // and still covers every possible alignment combination.
1081         //
1082 
1083         // Use unrolled routines only if there are a lot of vertices
1084         if (numVertices > OGRE_SSE_SKINNING_UNROLL_VERTICES)
1085         {
1086             if (pSrcNorm)
1087             {
1088                 // Blend position and normal
1089 
1090                 if (!mPreferGeneralVersionForSharedBuffers &&
1091                     srcPosStride == sizeof(float) * (3 + 3) && destPosStride == sizeof(float) * (3 + 3) &&
1092                     pSrcNorm == pSrcPos + 3 && pDestNorm == pDestPos + 3)
1093                 {
1094                     // Position and normal share a packed (interleaved) buffer
1095 
1096                     size_t srcPosAlign = (size_t)pSrcPos & 15;
1097                     assert((srcPosAlign & 3) == 0);
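                    // A sketch of the arithmetic behind the prologue below: in this branch
                    // each interleaved vertex is 6 floats (24 bytes), so the 16-byte
                    // alignment phase of pSrcPos shifts by 8 from one vertex to the next;
                    // if it is currently 8, skinning a single vertex with the general
                    // routine restores 16-byte alignment for the unrolled routine.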
1098 
1099                     // Blend unaligned vertices with general SIMD routine
1100                     if (srcPosAlign == 8)   // Because the alignment phase shifts by 8 bytes per vertex
1101                     {
1102                         size_t count = srcPosAlign / 8;
1103                         numVertices -= count;
1104                         softwareVertexSkinning_SSE_General(
1105                             pSrcPos, pDestPos,
1106                             pSrcNorm, pDestNorm,
1107                             pBlendWeight, pBlendIndex,
1108                             blendMatrices,
1109                             srcPosStride, destPosStride,
1110                             srcNormStride, destNormStride,
1111                             blendWeightStride, blendIndexStride,
1112                             numWeightsPerVertex,
1113                             count);
1114 
1115                         pSrcPos += count * (3 + 3);
1116                         pDestPos += count * (3 + 3);
1117                         pSrcNorm += count * (3 + 3);
1118                         pDestNorm += count * (3 + 3);
1119                         advanceRawPointer(pBlendWeight, count * blendWeightStride);
1120                         advanceRawPointer(pBlendIndex, count * blendIndexStride);
1121                     }
1122 
1123                     // Blend vertices, four vertices per-iteration
1124                     size_t numIterations = numVertices / 4;
1125                     softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
1126                         pSrcPos, pDestPos,
1127                         pBlendWeight, pBlendIndex,
1128                         blendMatrices,
1129                         blendWeightStride, blendIndexStride,
1130                         numWeightsPerVertex,
1131                         numIterations);
1132 
1133                     // Advance pointers for remaining vertices
1134                     numVertices &= 3;
1135                     if (numVertices)
1136                     {
1137                         pSrcPos += numIterations * 4 * (3 + 3);
1138                         pDestPos += numIterations * 4 * (3 + 3);
1139                         pSrcNorm += numIterations * 4 * (3 + 3);
1140                         pDestNorm += numIterations * 4 * (3 + 3);
1141                         advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1142                         advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1143                     }
1144                 }
1145                 else if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3 &&
1146                          srcNormStride == sizeof(float) * 3 && destNormStride == sizeof(float) * 3)
1147                 {
1148                     // Position and normal are separate buffers, and all of them are packed
1149 
1150                     size_t srcPosAlign = (size_t)pSrcPos & 15;
1151                     assert((srcPosAlign & 3) == 0);
1152 
1153                     // Blend unaligned vertices with general SIMD routine
1154                     if (srcPosAlign)
1155                     {
1156                         size_t count = srcPosAlign / 4;
1157                         numVertices -= count;
1158                         softwareVertexSkinning_SSE_General(
1159                             pSrcPos, pDestPos,
1160                             pSrcNorm, pDestNorm,
1161                             pBlendWeight, pBlendIndex,
1162                             blendMatrices,
1163                             srcPosStride, destPosStride,
1164                             srcNormStride, destNormStride,
1165                             blendWeightStride, blendIndexStride,
1166                             numWeightsPerVertex,
1167                             count);
1168 
1169                         pSrcPos += count * 3;
1170                         pDestPos += count * 3;
1171                         pSrcNorm += count * 3;
1172                         pDestNorm += count * 3;
1173                         advanceRawPointer(pBlendWeight, count * blendWeightStride);
1174                         advanceRawPointer(pBlendIndex, count * blendIndexStride);
1175                     }
1176 
1177                     // Blend vertices, four vertices per-iteration
1178                     size_t numIterations = numVertices / 4;
1179                     softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
1180                         pSrcPos, pDestPos,
1181                         pSrcNorm, pDestNorm,
1182                         pBlendWeight, pBlendIndex,
1183                         blendMatrices,
1184                         blendWeightStride, blendIndexStride,
1185                         numWeightsPerVertex,
1186                         numIterations);
1187 
1188                     // Advance pointers for remaining vertices
1189                     numVertices &= 3;
1190                     if (numVertices)
1191                     {
1192                         pSrcPos += numIterations * 4 * 3;
1193                         pDestPos += numIterations * 4 * 3;
1194                         pSrcNorm += numIterations * 4 * 3;
1195                         pDestNorm += numIterations * 4 * 3;
1196                         advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1197                         advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1198                     }
1199                 }
1200                 else    // Not 'packed' form or wrong order between position and normal
1201                 {
1202                     // Should never occur, do nothing here just in case
1203                 }
1204             }
1205             else    // !pSrcNorm
1206             {
1207                 // Blend position only
1208 
1209                 if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3)
1210                 {
1211                     // All buffers are packed
1212 
1213                     size_t srcPosAlign = (size_t)pSrcPos & 15;
1214                     assert((srcPosAlign & 3) == 0);
1215 
1216                     // Blend unaligned vertices with general SIMD routine
1217                     if (srcPosAlign)
1218                     {
1219                         size_t count = srcPosAlign / 4;
1220                         numVertices -= count;
1221                         softwareVertexSkinning_SSE_General(
1222                             pSrcPos, pDestPos,
1223                             pSrcNorm, pDestNorm,
1224                             pBlendWeight, pBlendIndex,
1225                             blendMatrices,
1226                             srcPosStride, destPosStride,
1227                             srcNormStride, destNormStride,
1228                             blendWeightStride, blendIndexStride,
1229                             numWeightsPerVertex,
1230                             count);
1231 
1232                         pSrcPos += count * 3;
1233                         pDestPos += count * 3;
1234                         advanceRawPointer(pBlendWeight, count * blendWeightStride);
1235                         advanceRawPointer(pBlendIndex, count * blendIndexStride);
1236                     }
1237 
1238                     // Blend vertices, four vertices per-iteration
1239                     size_t numIterations = numVertices / 4;
1240                     softwareVertexSkinning_SSE_PosOnly_Packed(
1241                         pSrcPos, pDestPos,
1242                         pBlendWeight, pBlendIndex,
1243                         blendMatrices,
1244                         blendWeightStride, blendIndexStride,
1245                         numWeightsPerVertex,
1246                         numIterations);
1247 
1248                     // Advance pointers for remaining vertices
1249                     numVertices &= 3;
1250                     if (numVertices)
1251                     {
1252                         pSrcPos += numIterations * 4 * 3;
1253                         pDestPos += numIterations * 4 * 3;
1254                         advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
1255                         advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
1256                     }
1257                 }
1258                 else    // Not 'packed' form
1259                 {
1260                     // Might occur only if user forced software blending position only
1261                 }
1262             }
1263         }
1264 
1265         // Blend the remaining vertices; this must also be done with SIMD to get identical
1266         // results, since mixing the general floating-point algorithm with the SIMD one
1267         // would introduce floating-point discrepancies.
1268         if (numVertices)
1269         {
1270             softwareVertexSkinning_SSE_General(
1271                 pSrcPos, pDestPos,
1272                 pSrcNorm, pDestNorm,
1273                 pBlendWeight, pBlendIndex,
1274                 blendMatrices,
1275                 srcPosStride, destPosStride,
1276                 srcNormStride, destNormStride,
1277                 blendWeightStride, blendIndexStride,
1278                 numWeightsPerVertex,
1279                 numVertices);
1280         }
1281     }
1282     //---------------------------------------------------------------------
1283     void OptimisedUtilSSE::softwareVertexMorph(
1284         Real t,
1285         const float *pSrc1, const float *pSrc2,
1286         float *pDst,
1287         size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
1288         size_t numVertices,
1289         bool morphNormals)
1290     {
1291         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1292 
1293         __m128 src01, src02, src11, src12, src21, src22;
1294         __m128 dst0, dst1, dst2;
1295 
1296         __m128 t4 = _mm_load_ps1(&t);
1297 
1298 
1299         // If we're morphing normals, we have twice the number of floats to process
1300         // Positions are interleaved with normals, so we'll have to separately
1301         // normalise just the normals later; we'll just lerp in the first pass
1302         // We can't normalise as we go because normals & positions are only 3 floats
1303         // each so are not aligned for SSE, we'd mix the data up
1304         size_t normalsMultiplier = morphNormals ? 2 : 1;
1305         size_t numIterations = (numVertices*normalsMultiplier) / 4;
1306         size_t numVerticesRemainder = (numVertices*normalsMultiplier) & 3;
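        // A minimal scalar sketch of what each SSE lane computes below, assuming
        // __MM_LERP_PS(t, a, b) evaluates to a + t * (b - a):
        //
        //     for (size_t i = 0; i < numVertices * normalsMultiplier * 3; ++i)
        //         pDst[i] = pSrc1[i] + t * (pSrc2[i] - pSrc1[i]);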
1307 
1308         // Save for later
1309         float *pStartDst = pDst;
1310 
1311         // Don't use the meta-function technique for memory access here; VC7.1 appears
1312         // to generate slightly inefficient binary code when the following code is
1313         // placed inside an inline function.
1314 
1315         if (_isAlignedForSSE(pSrc1) && _isAlignedForSSE(pSrc2) && _isAlignedForSSE(pDst))
1316         {
1317             // All data aligned
1318 
1319             // Morph 4 vertices per iteration. Specially designed to use as many of the
1320             // available CPU registers as possible (7 registers used here), and to avoid
1321             // temporary values spilled to the stack, which would cause extra memory
1322             // accesses.
1323             for (size_t i = 0; i < numIterations; ++i)
1324             {
1325                 // 12 floating-point values
1326                 src01 = __MM_LOAD_PS(pSrc1 + 0);
1327                 src02 = __MM_LOAD_PS(pSrc2 + 0);
1328                 src11 = __MM_LOAD_PS(pSrc1 + 4);
1329                 src12 = __MM_LOAD_PS(pSrc2 + 4);
1330                 src21 = __MM_LOAD_PS(pSrc1 + 8);
1331                 src22 = __MM_LOAD_PS(pSrc2 + 8);
1332                 pSrc1 += 12; pSrc2 += 12;
1333 
1334                 dst0 = __MM_LERP_PS(t4, src01, src02);
1335                 dst1 = __MM_LERP_PS(t4, src11, src12);
1336                 dst2 = __MM_LERP_PS(t4, src21, src22);
1337 
1338                 __MM_STORE_PS(pDst + 0, dst0);
1339                 __MM_STORE_PS(pDst + 4, dst1);
1340                 __MM_STORE_PS(pDst + 8, dst2);
1341                 pDst += 12;
1342             }
1343 
1344             // Morph remaining vertices
1345             switch (numVerticesRemainder)
1346             {
1347             case 3:
1348                 // 9 floating-point values
1349                 src01 = __MM_LOAD_PS(pSrc1 + 0);
1350                 src02 = __MM_LOAD_PS(pSrc2 + 0);
1351                 src11 = __MM_LOAD_PS(pSrc1 + 4);
1352                 src12 = __MM_LOAD_PS(pSrc2 + 4);
1353                 src21 = _mm_load_ss(pSrc1 + 8);
1354                 src22 = _mm_load_ss(pSrc2 + 8);
1355 
1356                 dst0 = __MM_LERP_PS(t4, src01, src02);
1357                 dst1 = __MM_LERP_PS(t4, src11, src12);
1358                 dst2 = __MM_LERP_SS(t4, src21, src22);
1359 
1360                 __MM_STORE_PS(pDst + 0, dst0);
1361                 __MM_STORE_PS(pDst + 4, dst1);
1362                 _mm_store_ss(pDst + 8, dst2);
1363                 break;
1364 
1365             case 2:
1366                 // 6 floating-point values
1367                 src01 = __MM_LOAD_PS(pSrc1 + 0);
1368                 src02 = __MM_LOAD_PS(pSrc2 + 0);
1369                 src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4));  // t4 is meaningless here
1370                 src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4));  // t4 is meaningless here
1371 
1372                 dst0 = __MM_LERP_PS(t4, src01, src02);
1373                 dst1 = __MM_LERP_PS(t4, src11, src12);
1374 
1375                 __MM_STORE_PS(pDst + 0, dst0);
1376                 _mm_storel_pi((__m64*)(pDst + 4), dst1);
1377                 break;
1378 
1379             case 1:
1380                 // 3 floating-point values
1381                 src01 = _mm_load_ss(pSrc1 + 2);
1382                 src02 = _mm_load_ss(pSrc2 + 2);
1383                 src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0));
1384                 src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0));
1385 
1386                 dst0 = __MM_LERP_PS(t4, src01, src02);
1387 
1388                 _mm_storeh_pi((__m64*)(pDst + 0), dst0);
1389                 _mm_store_ss(pDst + 2, dst0);
1390                 break;
1391             }
1392         }
1393         else    // Should never occur, just in case of buggy drivers
1394         {
1395             // Assume all data unaligned
1396 
1397             // Morph 4 vertices per iteration. Specially designed to use as many of the
1398             // available CPU registers as possible (7 registers used here), and to avoid
1399             // temporary values spilled to the stack, which would cause extra memory
1400             // accesses.
1401             for (size_t i = 0; i < numIterations; ++i)
1402             {
1403                 // 12 floating-point values
1404                 src01 = _mm_loadu_ps(pSrc1 + 0);
1405                 src02 = _mm_loadu_ps(pSrc2 + 0);
1406                 src11 = _mm_loadu_ps(pSrc1 + 4);
1407                 src12 = _mm_loadu_ps(pSrc2 + 4);
1408                 src21 = _mm_loadu_ps(pSrc1 + 8);
1409                 src22 = _mm_loadu_ps(pSrc2 + 8);
1410                 pSrc1 += 12; pSrc2 += 12;
1411 
1412                 dst0 = __MM_LERP_PS(t4, src01, src02);
1413                 dst1 = __MM_LERP_PS(t4, src11, src12);
1414                 dst2 = __MM_LERP_PS(t4, src21, src22);
1415 
1416                 _mm_storeu_ps(pDst + 0, dst0);
1417                 _mm_storeu_ps(pDst + 4, dst1);
1418                 _mm_storeu_ps(pDst + 8, dst2);
1419                 pDst += 12;
1420 
1421             }
1422 
1423             // Morph remaining vertices
1424             switch (numVerticesRemainder)
1425             {
1426             case 3:
1427                 // 9 floating-point values
1428                 src01 = _mm_loadu_ps(pSrc1 + 0);
1429                 src02 = _mm_loadu_ps(pSrc2 + 0);
1430                 src11 = _mm_loadu_ps(pSrc1 + 4);
1431                 src12 = _mm_loadu_ps(pSrc2 + 4);
1432                 src21 = _mm_load_ss(pSrc1 + 8);
1433                 src22 = _mm_load_ss(pSrc2 + 8);
1434 
1435                 dst0 = __MM_LERP_PS(t4, src01, src02);
1436                 dst1 = __MM_LERP_PS(t4, src11, src12);
1437                 dst2 = __MM_LERP_SS(t4, src21, src22);
1438 
1439                 _mm_storeu_ps(pDst + 0, dst0);
1440                 _mm_storeu_ps(pDst + 4, dst1);
1441                 _mm_store_ss(pDst + 8, dst2);
1442                 break;
1443 
1444             case 2:
1445                 // 6 floating-point values
1446                 src01 = _mm_loadu_ps(pSrc1 + 0);
1447                 src02 = _mm_loadu_ps(pSrc2 + 0);
1448                 src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4));  // t4 is meaningless here
1449                 src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4));  // t4 is meaningless here
1450 
1451                 dst0 = __MM_LERP_PS(t4, src01, src02);
1452                 dst1 = __MM_LERP_PS(t4, src11, src12);
1453 
1454                 _mm_storeu_ps(pDst + 0, dst0);
1455                 _mm_storel_pi((__m64*)(pDst + 4), dst1);
1456                 break;
1457 
1458             case 1:
1459                 // 3 floating-point values
1460                 src01 = _mm_load_ss(pSrc1 + 2);
1461                 src02 = _mm_load_ss(pSrc2 + 2);
1462                 src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0));
1463                 src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0));
1464 
1465                 dst0 = __MM_LERP_PS(t4, src01, src02);
1466 
1467                 _mm_storeh_pi((__m64*)(pDst + 0), dst0);
1468                 _mm_store_ss(pDst + 2, dst0);
1469                 break;
1470             }
1471 
1472         }
1473 
1474         if (morphNormals)
1475         {
1476 
1477             // Now we need to do an unaligned normalise on the normal data we just
1478             // lerped; because normals are 3 elements each, they're always unaligned
1479             float *pNorm = pStartDst;
1480 
1481             // Offset past first position
1482             pNorm += 3;
1483 
1484             // We'll do one normal each iteration, but still use SSE
1485             for (size_t n = 0; n < numVertices; ++n)
1486             {
1487                 // normalise function
1488                 __m128 norm;
1489 
1490                 // load 3 floating-point normal values
1491                 // This loads into [0] and clears the rest
1492                 norm = _mm_load_ss(pNorm + 2);
1493                 // This loads into [2,3]. [1] is unused
1494                 norm = _mm_loadh_pi(norm, (__m64*)(pNorm + 0));
1495 
1496                 // Fill a 4-vec with vector length
1497                 // square
1498                 __m128 tmp = _mm_mul_ps(norm, norm);
1499                 // Add - for this we want this effect:
1500                 // orig   3 | 2 | 1 | 0
1501                 // add1   0 | 0 | 0 | 2
1502                 // add2   2 | 3 | 0 | 3
1503                 // This way elements 0, 2 and 3 have the sum of all entries (except 1 which is unused)
1504 
1505                 tmp = _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,0,0,2)));
1506                 // Add the final combination; after this, elements 0, 2 and 3 of tmp hold the
1507                 // squared length (we don't care about the unused element 1), then sqrt
1508                 tmp = _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,3,0,3)));
1509                 // Then divide to normalise
1510                 norm = _mm_div_ps(norm, _mm_sqrt_ps(tmp));
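                // Scalar equivalent of the three SSE steps above (a sketch):
                //     float len = std::sqrt(x*x + y*y + z*z);
                //     x /= len; y /= len; z /= len;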
1511 
1512                 // Store back in the same place
1513                 _mm_storeh_pi((__m64*)(pNorm + 0), norm);
1514                 _mm_store_ss(pNorm + 2, norm);
1515 
1516                 // Skip to next vertex (3x normal components, 3x position components)
1517                 pNorm += 6;
1518 
1519 
1520             }
1521 
1522 
1523         }
1524     }
1525     //---------------------------------------------------------------------
1526     void OptimisedUtilSSE::concatenateAffineMatrices(
1527         const Affine3& baseMatrix,
1528         const Affine3* pSrcMat,
1529         Affine3* pDstMat,
1530         size_t numMatrices)
1531     {
1532         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1533 
1534         assert(_isAlignedForSSE(pSrcMat));
1535         assert(_isAlignedForSSE(pDstMat));
1536 
1537         // Load base matrix, unaligned
1538         __m128 m0 = _mm_loadu_ps(baseMatrix[0]);
1539         __m128 m1 = _mm_loadu_ps(baseMatrix[1]);
1540         __m128 m2 = _mm_loadu_ps(baseMatrix[2]);
1541         __m128 m3 = _mm_loadu_ps(baseMatrix[3]);        // m3 should be equal to (0, 0, 0, 1)
1542 
1543         for (size_t i = 0; i < numMatrices; ++i)
1544         {
1545             // Load source matrix, aligned
1546             __m128 s0 = __MM_LOAD_PS((*pSrcMat)[0]);
1547             __m128 s1 = __MM_LOAD_PS((*pSrcMat)[1]);
1548             __m128 s2 = __MM_LOAD_PS((*pSrcMat)[2]);
1549 
1550             ++pSrcMat;
1551 
1552             __m128 t0, t1, t2, t3;
1553 
1554             // Concatenate matrix, and store results
1555 
1556             // Row 0
1557             t0 = _mm_mul_ps(__MM_SELECT(m0, 0), s0);
1558             t1 = _mm_mul_ps(__MM_SELECT(m0, 1), s1);
1559             t2 = _mm_mul_ps(__MM_SELECT(m0, 2), s2);
1560             t3 = _mm_mul_ps(m0, m3);    // Compiler should optimise this out of the loop
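            // Note: since m3 == (0, 0, 0, 1), t3 == (0, 0, 0, m0.w), i.e. the translation
            // contribution of the implicit fourth row (0, 0, 0, 1) of the affine source
            // matrix; accumulating t0..t3 therefore yields the full concatenated row.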
1561             __MM_STORE_PS((*pDstMat)[0], __MM_ACCUM4_PS(t0,t1,t2,t3));
1562 
1563             // Row 1
1564             t0 = _mm_mul_ps(__MM_SELECT(m1, 0), s0);
1565             t1 = _mm_mul_ps(__MM_SELECT(m1, 1), s1);
1566             t2 = _mm_mul_ps(__MM_SELECT(m1, 2), s2);
1567             t3 = _mm_mul_ps(m1, m3);    // Compiler should optimise this out of the loop
1568             __MM_STORE_PS((*pDstMat)[1], __MM_ACCUM4_PS(t0,t1,t2,t3));
1569 
1570             // Row 2
1571             t0 = _mm_mul_ps(__MM_SELECT(m2, 0), s0);
1572             t1 = _mm_mul_ps(__MM_SELECT(m2, 1), s1);
1573             t2 = _mm_mul_ps(__MM_SELECT(m2, 2), s2);
1574             t3 = _mm_mul_ps(m2, m3);    // Compiler should optimise this out of the loop
1575             __MM_STORE_PS((*pDstMat)[2], __MM_ACCUM4_PS(t0,t1,t2,t3));
1576 
1577             ++pDstMat;
1578         }
1579     }
1580     //---------------------------------------------------------------------
1581     void OptimisedUtilSSE::calculateFaceNormals(
1582         const float *positions,
1583         const EdgeData::Triangle *triangles,
1584         Vector4 *faceNormals,
1585         size_t numTriangles)
1586     {
1587         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1588 
1589         assert(_isAlignedForSSE(faceNormals));
1590 
1591 // Load Vector3 as: (x, 0, y, z)
1592 #define __LOAD_VECTOR3(p)   _mm_loadh_pi(_mm_load_ss(p), (const __m64*)((p)+1))
1593 
1594         // Mask used to change the sign of single-precision floating-point values.
1595         OGRE_SIMD_ALIGNED_DECL(static const uint32, msSignMask[4]) =
1596         {
1597             0x80000000, 0x80000000, 0x80000000, 0x80000000,
1598         };
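        // XOR-ing a float with this mask flips its IEEE-754 sign bit, i.e. negates it
        // without a subtract; it is used below to compute -(n dot v0) in one step.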
1599 
1600         size_t numIterations = numTriangles / 4;
1601         numTriangles &= 3;
1602 
1603         // Four triangles per-iteration
1604         for (size_t i = 0; i < numIterations; ++i)
1605         {
1606 
1607 // Load four Vector3 as: (x0, x1, x2, x3), (y0, y1, y2, y3), (z0, z1, z2, z3)
1608 #define __LOAD_FOUR_VECTOR3(x, y, z, p0, p1, p2, p3)                    \
1609             {                                                           \
1610                 __m128 v0 = __LOAD_VECTOR3(p0);     /* x0 -- y0 z0 */   \
1611                 __m128 v1 = __LOAD_VECTOR3(p1);     /* x1 -- y1 z1 */   \
1612                 __m128 v2 = __LOAD_VECTOR3(p2);     /* x2 -- y2 z2 */   \
1613                 __m128 v3 = __LOAD_VECTOR3(p3);     /* x3 -- y3 z3 */   \
1614                 __m128 t0, t1;                                          \
1615                                                                         \
1616                 t0 = _mm_unpacklo_ps(v0, v2);       /* x0 x2 -- -- */   \
1617                 t1 = _mm_unpacklo_ps(v1, v3);       /* x1 x3 -- -- */   \
1618                 x  = _mm_unpacklo_ps(t0, t1);       /* x0 x1 x2 x3 */   \
1619                                                                         \
1620                 t0 = _mm_unpackhi_ps(v0, v2);       /* y0 y2 z0 z2 */   \
1621                 t1 = _mm_unpackhi_ps(v1, v3);       /* y1 y3 z1 z3 */   \
1622                 y  = _mm_unpacklo_ps(t0, t1);       /* y0 y1 y2 y3 */   \
1623                 z  = _mm_unpackhi_ps(t0, t1);       /* z0 z1 z2 z3 */   \
1624             }
1625 
1626             __m128 x0, x1, x2, y0, y1, y2, z0, z1, z2;
1627 
1628             // Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz
1629             __LOAD_FOUR_VECTOR3(x0, y0, z0,
1630                 positions + triangles[0].vertIndex[0] * 3,
1631                 positions + triangles[1].vertIndex[0] * 3,
1632                 positions + triangles[2].vertIndex[0] * 3,
1633                 positions + triangles[3].vertIndex[0] * 3);
1634 
1635             // Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz
1636             __LOAD_FOUR_VECTOR3(x1, y1, z1,
1637                 positions + triangles[0].vertIndex[1] * 3,
1638                 positions + triangles[1].vertIndex[1] * 3,
1639                 positions + triangles[2].vertIndex[1] * 3,
1640                 positions + triangles[3].vertIndex[1] * 3);
1641 
1642             // Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz
1643             __LOAD_FOUR_VECTOR3(x2, y2, z2,
1644                 positions + triangles[0].vertIndex[2] * 3,
1645                 positions + triangles[1].vertIndex[2] * 3,
1646                 positions + triangles[2].vertIndex[2] * 3,
1647                 positions + triangles[3].vertIndex[2] * 3);
1648 
1649             triangles += 4;
1650 
1651             // Calculate triangle face normals
1652 
1653             // a = v1 - v0
1654             __m128 ax = _mm_sub_ps(x1, x0);
1655             __m128 ay = _mm_sub_ps(y1, y0);
1656             __m128 az = _mm_sub_ps(z1, z0);
1657 
1658             // b = v2 - v0
1659             __m128 bx = _mm_sub_ps(x2, x0);
1660             __m128 by = _mm_sub_ps(y2, y0);
1661             __m128 bz = _mm_sub_ps(z2, z0);
1662 
1663             // n = a cross b
1664             __m128 nx = _mm_sub_ps(_mm_mul_ps(ay, bz), _mm_mul_ps(az, by));
1665             __m128 ny = _mm_sub_ps(_mm_mul_ps(az, bx), _mm_mul_ps(ax, bz));
1666             __m128 nz = _mm_sub_ps(_mm_mul_ps(ax, by), _mm_mul_ps(ay, bx));
1667 
1668             // w = - (n dot v0)
1669             __m128 nw = _mm_xor_ps(
1670                 __MM_DOT3x3_PS(nx, ny, nz, x0, y0, z0),
1671                 *(const __m128 *)&msSignMask);
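            // The resulting Vector4 is the triangle's plane equation (nx, ny, nz, -n.v0),
            // so n.p + w == 0 for any point p on the triangle's plane; this is what lets
            // calculateLightFacing() below test facing with a single 4-component dot
            // product against the light's homogeneous position.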
1672 
1673             // Arrange to per-triangle face normal major format
1674             __MM_TRANSPOSE4x4_PS(nx, ny, nz, nw);
1675 
1676             // Store results
1677             __MM_STORE_PS(&faceNormals[0].x, nx);
1678             __MM_STORE_PS(&faceNormals[1].x, ny);
1679             __MM_STORE_PS(&faceNormals[2].x, nz);
1680             __MM_STORE_PS(&faceNormals[3].x, nw);
1681             faceNormals += 4;
1682 
1683 #undef __LOAD_FOUR_VECTOR3
1684         }
1685 
1686         // Dealing with remaining triangles
1687         for (size_t j = 0; j < numTriangles; ++j)
1688         {
1689             // Load vertices of the triangle
1690             __m128 v0 = __LOAD_VECTOR3(positions + triangles->vertIndex[0] * 3);
1691             __m128 v1 = __LOAD_VECTOR3(positions + triangles->vertIndex[1] * 3);
1692             __m128 v2 = __LOAD_VECTOR3(positions + triangles->vertIndex[2] * 3);
1693             ++triangles;
1694 
1695             // Calculate face normal
1696 
1697             __m128 t0, t1;
1698 
1699             __m128 a = _mm_sub_ps(v1, v0);                      // ax 0 ay az
1700             __m128 b = _mm_sub_ps(v2, v0);                      // bx 0 by bz
1701             t0 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,0,1,3));    // az 0 ax ay
1702             t1 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2,0,1,3));    // bz 0 bx by
1703             t0 = _mm_mul_ps(t0, b);                             // az*bx 0 ax*by ay*bz
1704             t1 = _mm_mul_ps(t1, a);                             // ax*bz 0 ay*bx az*by
1705 
1706             __m128 n = _mm_sub_ps(t0, t1);                      // ny 0  nz nx
1707 
1708             __m128 d = _mm_mul_ps(                              // dy 0  dz dx
1709                 _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,3,1,2)), n);
1710 
1711             n = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(               // nx ny nz -(dx+dy+dz)
1712                 _mm_shuffle_ps(n, n, _MM_SHUFFLE(1,2,0,3)),     // nx ny nz 0
1713                 _mm_shuffle_ps(d, d, _MM_SHUFFLE(3,1,1,1))),    // 0  0  0  dx
1714                 _mm_shuffle_ps(d, d, _MM_SHUFFLE(0,1,1,1))),    // 0  0  0  dy
1715                 _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,1,1,1)));    // 0  0  0  dz
1716 
1717             // Store result
1718             __MM_STORE_PS(&faceNormals->x, n);
1719             ++faceNormals;
1720         }
1721 
1722 #undef __LOAD_VECTOR3
1723     }
1724     //---------------------------------------------------------------------
1725     void OptimisedUtilSSE::calculateLightFacing(
1726         const Vector4& lightPos,
1727         const Vector4* faceNormals,
1728         char* lightFacings,
1729         size_t numFaces)
1730     {
1731         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
1732 
1733         assert(_isAlignedForSSE(faceNormals));
1734 
1735         // Map to convert a 4-bit mask to 4 byte values
1736         static const char msMaskMapping[16][4] =
1737         {
1738             {0, 0, 0, 0},   {1, 0, 0, 0},   {0, 1, 0, 0},   {1, 1, 0, 0},
1739             {0, 0, 1, 0},   {1, 0, 1, 0},   {0, 1, 1, 0},   {1, 1, 1, 0},
1740             {0, 0, 0, 1},   {1, 0, 0, 1},   {0, 1, 0, 1},   {1, 1, 0, 1},
1741             {0, 0, 1, 1},   {1, 0, 1, 1},   {0, 1, 1, 1},   {1, 1, 1, 1},
1742         };
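        // For example, a movemask result of 0b0101 (faces 0 and 2 facing the light)
        // expands to {1, 0, 1, 0}, one byte per face, via a single table lookup.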
1743 
1744         __m128 n0, n1, n2, n3;
1745         __m128 t0, t1;
1746         __m128 dp;
1747         int bitmask;
1748 
1749         // Load light vector, unaligned
1750         __m128 lp = _mm_loadu_ps(&lightPos.x);
1751 
1752         // Preload zero into a register for comparing the dot product values
1753         __m128 zero = _mm_setzero_ps();
1754 
1755         size_t numIterations = numFaces / 4;
1756         numFaces &= 3;
1757 
1758         // Four faces per-iteration
1759         for (size_t i = 0; i < numIterations; ++i)
1760         {
1761             // Load face normals, aligned
1762             n0 = __MM_LOAD_PS(&faceNormals[0].x);
1763             n1 = __MM_LOAD_PS(&faceNormals[1].x);
1764             n2 = __MM_LOAD_PS(&faceNormals[2].x);
1765             n3 = __MM_LOAD_PS(&faceNormals[3].x);
1766             faceNormals += 4;
1767 
1768             // Multiply by light vector
1769             n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1770             n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
1771             n2 = _mm_mul_ps(n2, lp);        // x2 y2 z2 w2
1772             n3 = _mm_mul_ps(n3, lp);        // x3 y3 z3 w3
1773 
1774             // Horizontal add four vector values.
1775             t0 = _mm_add_ps(                                            // x0+z0 x1+z1 y0+w0 y1+w1
1776                 _mm_unpacklo_ps(n0, n1),    // x0 x1 y0 y1
1777                 _mm_unpackhi_ps(n0, n1));   // z0 z1 w0 w1
1778             t1 = _mm_add_ps(                                            // x2+z2 x3+z3 y2+w2 y3+w3
1779                 _mm_unpacklo_ps(n2, n3),    // x2 x3 y2 y3
1780                 _mm_unpackhi_ps(n2, n3));   // z2 z3 w2 w3
1781             dp = _mm_add_ps(                                            // dp0 dp1 dp2 dp3
1782                 _mm_movelh_ps(t0, t1),      // x0+z0 x1+z1 x2+z2 x3+z3
1783                 _mm_movehl_ps(t1, t0));     // y0+w0 y1+w1 y2+w2 y3+w3
1784 
1785             // Compare greater than zero and set up a 4-bit mask. Use '_mm_cmpnle_ps'
1786             // instead of '_mm_cmpgt_ps' here because we want to keep 'zero' untouched,
1787             // i.e. as the 2nd operand of the assembly instruction. In fact,
1788             // '_mm_cmpgt_ps' is implemented as 'CMPLTPS' with the operands swapped
1789             // in VC7.1.
1790             bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
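            // _mm_movemask_ps packs the sign bits of the four comparison results into
            // the low 4 bits of an int: bit i is set exactly when dp[i] > 0, i.e. when
            // face i is facing the light.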
1791 
1792             // Convert 4-bits mask to 4 bytes, and store results.
1793             /*
1794             *reinterpret_cast<uint32*>(lightFacings) =
1795                 *reinterpret_cast<const uint32*>(msMaskMapping[bitmask]);
1796                 */
1797             memcpy(lightFacings, msMaskMapping[bitmask], sizeof(uint32));
1798 
1799 
1800             lightFacings += 4;
1801         }
1802 
1803         // Dealing with remaining faces
1804         switch (numFaces)
1805         {
1806         case 3:
1807             n0 = __MM_LOAD_PS(&faceNormals[0].x);
1808             n1 = __MM_LOAD_PS(&faceNormals[1].x);
1809             n2 = __MM_LOAD_PS(&faceNormals[2].x);
1810 
1811             n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1812             n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
1813             n2 = _mm_mul_ps(n2, lp);        // x2 y2 z2 w2
1814 
1815             t0 = _mm_add_ps(                                            // x0+z0 x1+z1 y0+w0 y1+w1
1816                 _mm_unpacklo_ps(n0, n1),    // x0 x1 y0 y1
1817                 _mm_unpackhi_ps(n0, n1));   // z0 z1 w0 w1
1818             t1 = _mm_add_ps(                                            // x2+z2 x2+z2 y2+w2 y2+w2
1819                 _mm_unpacklo_ps(n2, n2),    // x2 x2 y2 y2
1820                 _mm_unpackhi_ps(n2, n2));   // z2 z2 w2 w2
1821             dp = _mm_add_ps(                                            // dp0 dp1 dp2 dp2
1822                 _mm_movelh_ps(t0, t1),      // x0+z0 x1+z1 x2+z2 x2+z2
1823                 _mm_movehl_ps(t1, t0));     // y0+w0 y1+w1 y2+w2 y2+w2
1824 
1825             bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1826 
1827             lightFacings[0] = msMaskMapping[bitmask][0];
1828             lightFacings[1] = msMaskMapping[bitmask][1];
1829             lightFacings[2] = msMaskMapping[bitmask][2];
1830             break;
1831 
1832         case 2:
1833             n0 = __MM_LOAD_PS(&faceNormals[0].x);
1834             n1 = __MM_LOAD_PS(&faceNormals[1].x);
1835 
1836             n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1837             n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
1838 
1839             t0 = _mm_add_ps(                                            // x0+z0 x1+z1 y0+w0 y1+w1
1840                 _mm_unpacklo_ps(n0, n1),    // x0 x1 y0 y1
1841                 _mm_unpackhi_ps(n0, n1));   // z0 z1 w0 w1
1842             dp = _mm_add_ps(                                            // dp0 dp1 dp0 dp1
1843                 _mm_movelh_ps(t0, t0),      // x0+z0 x1+z1 x0+z0 x1+z1
1844                 _mm_movehl_ps(t0, t0));     // y0+w0 y1+w1 y0+w0 y1+w1
1845 
1846             bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1847 
1848             lightFacings[0] = msMaskMapping[bitmask][0];
1849             lightFacings[1] = msMaskMapping[bitmask][1];
1850             break;
1851 
1852         case 1:
1853             n0 = __MM_LOAD_PS(&faceNormals[0].x);
1854 
1855             n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
1856 
1857             t0 = _mm_add_ps(                                            // x0+z0 x0+z0 y0+w0 y0+w0
1858                 _mm_unpacklo_ps(n0, n0),    // x0 x0 y0 y0
1859                 _mm_unpackhi_ps(n0, n0));   // z0 z0 w0 w0
1860             dp = _mm_add_ps(                                            // dp0 dp0 dp0 dp0
1861                 _mm_movelh_ps(t0, t0),      // x0+z0 x0+z0 x0+z0 x0+z0
1862                 _mm_movehl_ps(t0, t0));     // y0+w0 y0+w0 y0+w0 y0+w0
1863 
1864             bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));
1865 
1866             lightFacings[0] = msMaskMapping[bitmask][0];
1867             break;
1868         }
1869     }
1870     //---------------------------------------------------------------------
1871     // Template to extrude vertices for directional light.
1872     template <bool srcAligned, bool destAligned>
1873     struct ExtrudeVertices_SSE_DirectionalLight
1874     {
1875         static void apply(
1876             const Vector4& lightPos,
1877             Real extrudeDist,
1878             const float* pSrcPos,
1879             float* pDestPos,
1880             size_t numVertices)
1881         {
1882             typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
1883             typedef SSEMemoryAccessor<destAligned> DestAccessor;
1884 
1885             // Directional light, extrusion is along light direction
1886 
1887             // Load light vector, unaligned
1888             __m128 lp = _mm_loadu_ps(&lightPos.x);
1889 
1890             // Calculate the extrusion direction. Note that we compute the inverted
1891             // direction here to eliminate an extra negation instruction; we compensate
1892             // for that by using a subtract instruction later instead.
1893             __m128 tmp = _mm_mul_ps(lp, lp);
1894             tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)), _mm_movehl_ps(tmp, tmp));
1895             // VC7.1 seems to generate slightly inefficient code for 'rsqrtss', so use 'rsqrtps' instead
1896             tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), _mm_load_ss(&extrudeDist));
1897             __m128 dir = _mm_mul_ps(lp, __MM_SELECT(tmp, 0));               // X Y Z -
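            // At this point dir == lightPos.xyz * (extrudeDist / |lightPos.xyz|), i.e. the
            // light direction rescaled to length extrudeDist; since it points the "wrong"
            // way, the loops below compute extruded = position - dir.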
1898 
1899             // Prepare the extrusion direction for extruding 4 vertices in parallel
1900             __m128 dir0 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(0,2,1,0));   // X Y Z X
1901             __m128 dir1 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(1,0,2,1));   // Y Z X Y
1902             __m128 dir2 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(2,1,0,2));   // Z X Y Z
1903 
1904             __m128 s0, s1, s2;
1905             __m128 d0, d1, d2;
1906 
1907             size_t numIterations = numVertices / 4;
1908             numVertices &= 3;
1909 
1910             // Extruding 4 vertices per-iteration
1911             for (size_t i = 0; i < numIterations; ++i)
1912             {
1913                 s0 = SrcAccessor::load(pSrcPos + 0);
1914                 s1 = SrcAccessor::load(pSrcPos + 4);
1915                 s2 = SrcAccessor::load(pSrcPos + 8);
1916                 pSrcPos += 12;
1917 
1918                 // The extrusion direction is inverted, use subtract instruction here
1919                 d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
1920                 d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 X2 Y2
1921                 d2 = _mm_sub_ps(s2, dir2);                      // Z2 X3 Y3 Z3
1922 
1923                 DestAccessor::store(pDestPos + 0, d0);
1924                 DestAccessor::store(pDestPos + 4, d1);
1925                 DestAccessor::store(pDestPos + 8, d2);
1926                 pDestPos += 12;
1927             }
1928 
1929             // Dealing with remaining vertices
1930             switch (numVertices)
1931             {
1932             case 3:
1933                 // 9 floating-point values
1934                 s0 = SrcAccessor::load(pSrcPos + 0);
1935                 s1 = SrcAccessor::load(pSrcPos + 4);
1936                 s2 = _mm_load_ss(pSrcPos + 8);
1937 
1938                 // The extrusion direction is inverted, use subtract instruction here
1939                 d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
1940                 d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 X2 Y2
1941                 d2 = _mm_sub_ss(s2, dir2);                      // Z2 -- -- --
1942 
1943                 DestAccessor::store(pDestPos + 0, d0);
1944                 DestAccessor::store(pDestPos + 4, d1);
1945                 _mm_store_ss(pDestPos + 8, d2);
1946                 break;
1947 
1948             case 2:
1949                 // 6 floating-point values
1950                 s0 = SrcAccessor::load(pSrcPos + 0);
1951                 s1 = _mm_loadl_pi(dir1, (const __m64*)(pSrcPos + 4)); // dir1 is meaningless here
1952 
1953                 // The extrusion direction is inverted, use subtract instruction here
1954                 d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
1955                 d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 -- --
1956 
1957                 DestAccessor::store(pDestPos + 0, d0);
1958                 _mm_storel_pi((__m64*)(pDestPos + 4), d1);
1959                 break;
1960 
1961             case 1:
1962                 // 3 floating-point values
1963                 s0 = _mm_loadl_pi(dir0, (const __m64*)(pSrcPos + 0)); // dir0 is meaningless here
1964                 s1 = _mm_load_ss(pSrcPos + 2);
1965 
1966                 // The extrusion direction is inverted, use subtract instruction here
1967                 d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 -- --
1968                 d1 = _mm_sub_ss(s1, dir2);                      // Z0 -- -- --
1969 
1970                 _mm_storel_pi((__m64*)(pDestPos + 0), d0);
1971                 _mm_store_ss(pDestPos + 2, d1);
1972                 break;
1973             }
1974         }
1975     };
1976     //---------------------------------------------------------------------
1977     // Template to extrude vertices for point light.
1978     template <bool srcAligned, bool destAligned>
1979     struct ExtrudeVertices_SSE_PointLight
1980     {
1981         static void apply(
1982             const Vector4& lightPos,
1983             Real extrudeDist,
1984             const float* pSrcPos,
1985             float* pDestPos,
1986             size_t numVertices)
1987         {
1988             typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
1989             typedef SSEMemoryAccessor<destAligned> DestAccessor;
1990 
1991             // Point light, will calculate extrusion direction for every vertex
1992 
1993             // Load light vector, unaligned
1994             __m128 lp = _mm_loadu_ps(&lightPos.x);
1995 
1996             // Load extrude distance
1997             __m128 extrudeDist4 = _mm_load_ps1(&extrudeDist);
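            // Per-vertex math performed below (a scalar sketch):
            //     d = position - lightPos.xyz;
            //     extruded = position + d * (extrudeDist / length(d));
            // where the length is obtained via the approximate SSE reciprocal square root.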
1998 
1999             size_t numIterations = numVertices / 4;
2000             numVertices &= 3;
2001 
2002             // Extruding 4 vertices per-iteration
2003             for (size_t i = 0; i < numIterations; ++i)
2004             {
2005                 // Load source positions
2006                 __m128 s0 = SrcAccessor::load(pSrcPos + 0);     // x0 y0 z0 x1
2007                 __m128 s1 = SrcAccessor::load(pSrcPos + 4);     // y1 z1 x2 y2
2008                 __m128 s2 = SrcAccessor::load(pSrcPos + 8);     // z2 x3 y3 z3
2009                 pSrcPos += 12;
2010 
2011                 // Rearrange to 3x4 component-major (SoA) layout for batch calculation
2012                 __MM_TRANSPOSE4x3_PS(s0, s1, s2);
2013 
2014                 // Calculate unnormalised extrusion direction
2015                 __m128 dx = _mm_sub_ps(s0, __MM_SELECT(lp, 0)); // X0 X1 X2 X3
2016                 __m128 dy = _mm_sub_ps(s1, __MM_SELECT(lp, 1)); // Y0 Y1 Y2 Y3
2017                 __m128 dz = _mm_sub_ps(s2, __MM_SELECT(lp, 2)); // Z0 Z1 Z2 Z3
2018 
2019                 // Normalise extrusion direction and multiply by extrude distance
2020                 __m128 tmp = __MM_DOT3x3_PS(dx, dy, dz, dx, dy, dz);
2021                 tmp = _mm_mul_ps(_mm_rsqrt_ps(tmp), extrudeDist4);
2022                 dx = _mm_mul_ps(dx, tmp);
2023                 dy = _mm_mul_ps(dy, tmp);
2024                 dz = _mm_mul_ps(dz, tmp);
2025 
2026                 // Calculate extruded positions
2027                 __m128 d0 = _mm_add_ps(dx, s0);
2028                 __m128 d1 = _mm_add_ps(dy, s1);
2029                 __m128 d2 = _mm_add_ps(dz, s2);
2030 
2031                 // Rearrange back to 4x3 contiguous format to store the results
2032                 __MM_TRANSPOSE3x4_PS(d0, d1, d2);
2033 
2034                 // Store extruded positions
2035                 DestAccessor::store(pDestPos + 0, d0);
2036                 DestAccessor::store(pDestPos + 4, d1);
2037                 DestAccessor::store(pDestPos + 8, d2);
2038                 pDestPos += 12;
2039             }
2040 
2041             // Dealing with remaining vertices
2042             for (size_t j = 0; j  < numVertices; ++j)
2043             {
2044                 // Load source position
2045                 __m128 src = _mm_loadh_pi(_mm_load_ss(pSrcPos + 0), (const __m64*)(pSrcPos + 1)); // x 0 y z
2046                 pSrcPos += 3;
2047 
2048                 // Calculate unnormalised extrusion direction
2049                 __m128 dir = _mm_sub_ps(src, _mm_shuffle_ps(lp, lp, _MM_SHUFFLE(2,1,3,0))); // X 1 Y Z
2050 
2051                 // Normalise extrusion direction and multiply by extrude distance
2052                 __m128 tmp = _mm_mul_ps(dir, dir);
2053                 tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_movehl_ps(tmp, tmp)), _mm_shuffle_ps(tmp, tmp, 3));
2054                 // VC7.1 seems to generate slightly inefficient code for 'rsqrtss', so use 'rsqrtps' instead
2055                 tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), extrudeDist4);
2056                 dir = _mm_mul_ps(dir, __MM_SELECT(tmp, 0));
2057 
2058                 // Calculate extruded position
2059                 __m128 dst = _mm_add_ps(dir, src);
2060 
2061                 // Store extruded position
2062                 _mm_store_ss(pDestPos + 0, dst);
2063                 _mm_storeh_pi((__m64*)(pDestPos + 1), dst);
2064                 pDestPos += 3;
2065             }
2066         }
2067     };
2068     //---------------------------------------------------------------------
2069     void OptimisedUtilSSE::extrudeVertices(
2070         const Vector4& lightPos,
2071         Real extrudeDist,
2072         const float* pSrcPos,
2073         float* pDestPos,
2074         size_t numVertices)
2075     {
2076         __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();
2077 
2078         // Note: since pDestPos immediately follows the tail of pSrcPos, we can't assume
2079         // it is properly aligned to the SIMD alignment, so we must check for it here.
2080         //
2081         // TODO: Add an extra vertex to the vertex buffer to make sure pDestPos has the
2082         // same alignment as pSrcPos.
2083         //
2084 
2085         // We use the SSE reciprocal square root directly while calculating the
2086         // extrusion direction, since the precision loss is not that important here.
2087         //
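        // lightPos.w == 0 indicates a directional light; w == 1 a point light
        // (asserted in the else branch below).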
2088         if (lightPos.w == 0.0f)
2089         {
2090             if (_isAlignedForSSE(pSrcPos))
2091             {
2092                 if (_isAlignedForSSE(pDestPos))
2093                     ExtrudeVertices_SSE_DirectionalLight<true, true>::apply(
2094                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2095                 else
2096                     ExtrudeVertices_SSE_DirectionalLight<true, false>::apply(
2097                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2098             }
2099             else
2100             {
2101                 if (_isAlignedForSSE(pDestPos))
2102                     ExtrudeVertices_SSE_DirectionalLight<false, true>::apply(
2103                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2104                 else
2105                     ExtrudeVertices_SSE_DirectionalLight<false, false>::apply(
2106                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2107             }
2108         }
2109         else
2110         {
2111             assert(lightPos.w == 1.0f);
2112 
2113             if (_isAlignedForSSE(pSrcPos))
2114             {
2115                 if (_isAlignedForSSE(pDestPos))
2116                     ExtrudeVertices_SSE_PointLight<true, true>::apply(
2117                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2118                 else
2119                     ExtrudeVertices_SSE_PointLight<true, false>::apply(
2120                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2121             }
2122             else
2123             {
2124                 if (_isAlignedForSSE(pDestPos))
2125                     ExtrudeVertices_SSE_PointLight<false, true>::apply(
2126                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2127                 else
2128                     ExtrudeVertices_SSE_PointLight<false, false>::apply(
2129                         lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
2130             }
2131         }
2132     }
2133     //---------------------------------------------------------------------
2134     //---------------------------------------------------------------------
2135     //---------------------------------------------------------------------
2136     extern OptimisedUtil* _getOptimisedUtilSSE(void);
2137     extern OptimisedUtil* _getOptimisedUtilSSE(void)
2138     {
2139         static OptimisedUtilSSE msOptimisedUtilSSE;
2140 #if defined(__OGRE_SIMD_ALIGN_STACK)
2141         static OptimisedUtilWithStackAlign msOptimisedUtilWithStackAlign(&msOptimisedUtilSSE);
2142         return &msOptimisedUtilWithStackAlign;
2143 #else
2144         return &msOptimisedUtilSSE;
2145 #endif
2146     }
2147 
2148 }
2149 
2150 #endif // __OGRE_HAVE_SSE || __OGRE_HAVE_NEON
2151