/*
-----------------------------------------------------------------------------
This source file is part of OGRE
(Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2013 Torus Knot Software Ltd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-----------------------------------------------------------------------------
*/
#include "OgreStableHeaders.h"

#include "OgreOptimisedUtil.h"
#include "OgrePlatformInformation.h"

#if __OGRE_HAVE_SSE

#include "OgreMatrix4.h"

// Keep this include last, to avoid "xmmintrin.h" being pulled in earlier by
// some other header file on some platforms.
#include "OgreSIMDHelper.h"

// I'd like to merge this file with OgreOptimisedUtil.cpp, but that's not
// possible when compiling with gcc, because SSE instructions can only be
// enabled/disabled at file level.

//-------------------------------------------------------------------------
//
// The routines implemented in this file are performance oriented, which
// means saving every penny possible. This requirement might break some
// C++/STL rules.
//
//
// Some rules I'd like to respect:
//
// 1. Prefer unpacklo/hi and movelh/hl to shuffle, because they can save
//    one byte of binary code :)
// 2. Use add/sub instead of mul.
// 3. Eliminate function-call prologue code.
//
// And lastly, anything recommended by the Intel Optimization Reference Manual.
//
//-------------------------------------------------------------------------

// Use the unrolled SSE version when the vertex count exceeds this limit
#define OGRE_SSE_SKINNING_UNROLL_VERTICES 16

namespace Ogre {

//-------------------------------------------------------------------------
// Local classes
//-------------------------------------------------------------------------

    /** SSE implementation of OptimisedUtil.
    @note
        Don't use this class directly, use OptimisedUtil instead.
    */
    class _OgrePrivate OptimisedUtilSSE : public OptimisedUtil
    {
    protected:
        /// Do we prefer to use a general SSE version for position/normal shared buffers?
        bool mPreferGeneralVersionForSharedBuffers;

    public:
        /// Constructor
        OptimisedUtilSSE(void);

        /// @copydoc OptimisedUtil::softwareVertexSkinning
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexSkinning(
            const float *srcPosPtr, float *destPosPtr,
            const float *srcNormPtr, float *destNormPtr,
            const float *blendWeightPtr, const unsigned char* blendIndexPtr,
            const Matrix4* const* blendMatrices,
            size_t srcPosStride, size_t destPosStride,
            size_t srcNormStride, size_t destNormStride,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numVertices);

        /// @copydoc OptimisedUtil::softwareVertexMorph
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexMorph(
            Real t,
            const float *srcPos1, const float *srcPos2,
            float *dstPos,
            size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
            size_t numVertices,
            bool morphNormals);

        /// @copydoc OptimisedUtil::concatenateAffineMatrices
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE concatenateAffineMatrices(
            const Matrix4& baseMatrix,
            const Matrix4* srcMatrices,
            Matrix4* dstMatrices,
            size_t numMatrices);

        /// @copydoc OptimisedUtil::calculateFaceNormals
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateFaceNormals(
            const float *positions,
            const EdgeData::Triangle *triangles,
            Vector4 *faceNormals,
            size_t numTriangles);

        /// @copydoc OptimisedUtil::calculateLightFacing
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateLightFacing(
            const Vector4& lightPos,
            const Vector4* faceNormals,
            char* lightFacings,
            size_t numFaces);

        /// @copydoc OptimisedUtil::extrudeVertices
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE extrudeVertices(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* srcPositions,
            float* destPositions,
            size_t numVertices);
    };

#if defined(__OGRE_SIMD_ALIGN_STACK)
    /** Stack-aligning implementation of OptimisedUtil.
    @remarks
        User code compiled by icc and gcc might not align the stack
        properly, so we need to ensure the stack is aligned to a 16-byte
        boundary when executing SSE functions.
    @par
        We implement this by aligning the stack around a virtual function
        call, which should guarantee that a call instruction is used
        instead of the underlying function body being inlined here (which
        might cause problems).
    @note
        Don't use this class directly, use OptimisedUtil instead.
    */
    class _OgrePrivate OptimisedUtilWithStackAlign : public OptimisedUtil
    {
    protected:
        /// The actual implementation
        OptimisedUtil* mImpl;

    public:
        /// Constructor
        OptimisedUtilWithStackAlign(OptimisedUtil* impl)
            : mImpl(impl)
        {
        }

        /// @copydoc OptimisedUtil::softwareVertexSkinning
        virtual void softwareVertexSkinning(
            const float *srcPosPtr, float *destPosPtr,
            const float *srcNormPtr, float *destNormPtr,
            const float *blendWeightPtr, const unsigned char* blendIndexPtr,
            const Matrix4* const* blendMatrices,
            size_t srcPosStride, size_t destPosStride,
            size_t srcNormStride, size_t destNormStride,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numVertices)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->softwareVertexSkinning(
                srcPosPtr, destPosPtr,
                srcNormPtr, destNormPtr,
                blendWeightPtr, blendIndexPtr,
                blendMatrices,
                srcPosStride, destPosStride,
                srcNormStride, destNormStride,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numVertices);
        }

        /// @copydoc OptimisedUtil::softwareVertexMorph
        virtual void softwareVertexMorph(
            Real t,
            const float *srcPos1, const float *srcPos2,
            float *dstPos,
            size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
            size_t numVertices,
            bool morphNormals)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->softwareVertexMorph(
                t,
                srcPos1, srcPos2,
                dstPos,
                pos1VSize, pos2VSize, dstVSize,
                numVertices,
                morphNormals);
        }

        /// @copydoc OptimisedUtil::concatenateAffineMatrices
        virtual void concatenateAffineMatrices(
            const Matrix4& baseMatrix,
            const Matrix4* srcMatrices,
            Matrix4* dstMatrices,
            size_t numMatrices)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->concatenateAffineMatrices(
                baseMatrix,
                srcMatrices,
                dstMatrices,
                numMatrices);
        }

        /// @copydoc OptimisedUtil::calculateFaceNormals
        virtual void calculateFaceNormals(
            const float *positions,
            const EdgeData::Triangle *triangles,
            Vector4 *faceNormals,
            size_t numTriangles)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->calculateFaceNormals(
                positions,
                triangles,
                faceNormals,
                numTriangles);
        }

        /// @copydoc OptimisedUtil::calculateLightFacing
        virtual void calculateLightFacing(
            const Vector4& lightPos,
            const Vector4* faceNormals,
            char* lightFacings,
            size_t numFaces)
        {
            __OGRE_SIMD_ALIGN_STACK();

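            // Delegate to the wrapped implementation now that the stack is aligned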
            mImpl->calculateLightFacing(
                lightPos,
                faceNormals,
                lightFacings,
                numFaces);
        }

        /// @copydoc OptimisedUtil::extrudeVertices
        virtual void extrudeVertices(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* srcPositions,
            float* destPositions,
            size_t numVertices)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->extrudeVertices(
                lightPos,
                extrudeDist,
                srcPositions,
                destPositions,
                numVertices);
        }
    };
#endif  // defined(__OGRE_SIMD_ALIGN_STACK)

//---------------------------------------------------------------------
// Some useful macros for collapsing matrices.
//---------------------------------------------------------------------

#define __LOAD_MATRIX(row0, row1, row2, pMatrix)                            \
    {                                                                       \
        row0 = __MM_LOAD_PS((*pMatrix)[0]);                                 \
        row1 = __MM_LOAD_PS((*pMatrix)[1]);                                 \
        row2 = __MM_LOAD_PS((*pMatrix)[2]);                                 \
    }

#define __LERP_MATRIX(row0, row1, row2, weight, pMatrix)                    \
    {                                                                       \
        row0 = __MM_LERP_PS(weight, row0, __MM_LOAD_PS((*pMatrix)[0]));     \
        row1 = __MM_LERP_PS(weight, row1, __MM_LOAD_PS((*pMatrix)[1]));     \
        row2 = __MM_LERP_PS(weight, row2, __MM_LOAD_PS((*pMatrix)[2]));     \
    }

#define __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix)           \
    {                                                                       \
        row0 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[0]), weight);             \
        row1 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[1]), weight);             \
        row2 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[2]), weight);             \
    }

#define __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix)          \
    {                                                                       \
        row0 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[0]), weight, row0);     \
        row1 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[1]), weight, row1);     \
        row2 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[2]), weight, row2);     \
    }

//---------------------------------------------------------------------
// The following macros require variables declared by the caller.
//
// :) Thanks to the row-major matrices used in Ogre, accessing affine
// matrices here is easy.
//---------------------------------------------------------------------

/** Collapse a one-weight matrix.
    The multiply by the weight is eliminated, since the weight is always equal to one.
*/
#define __COLLAPSE_MATRIX_W1(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
    {                                                                           \
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
        __LOAD_MATRIX(row0, row1, row2, pMatrix0);                              \
    }

/** Collapse a two-weight matrix.
    Based on the fact that the accumulated weights are equal to one, using a
    lerp replaces two multiplies and one addition with one multiply and two
    additions.
*/
#define __COLLAPSE_MATRIX_W2(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
    {                                                                           \
        weight = _mm_load_ps1(pWeights + 1);                                    \
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
        __LOAD_MATRIX(row0, row1, row2, pMatrix0);                              \
        pMatrix1 = ppMatrices[pIndices[1]];                                     \
        __LERP_MATRIX(row0, row1, row2, weight, pMatrix1);                      \
    }

/** Collapse a three-weight matrix.
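    Each weight is loaded separately; the first weighted matrix initialises the
    rows and the remaining two are accumulated, since three weights cannot be
    folded into a single lerp the way two weights can.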
*/
#define __COLLAPSE_MATRIX_W3(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
    {                                                                           \
        weight = _mm_load_ps1(pWeights + 0);                                    \
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
        __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0);             \
        weight = _mm_load_ps1(pWeights + 1);                                    \
        pMatrix1 = ppMatrices[pIndices[1]];                                     \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1);            \
        weight = _mm_load_ps1(pWeights + 2);                                    \
        pMatrix2 = ppMatrices[pIndices[2]];                                     \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2);            \
    }

/** Collapse a four-weight matrix.
*/
#define __COLLAPSE_MATRIX_W4(row0, row1, row2, ppMatrices, pIndices, pWeights)  \
    {                                                                           \
        /* Load four blend weights at once; they will be shuffled later */      \
        weights = _mm_loadu_ps(pWeights);                                       \
                                                                                \
        pMatrix0 = ppMatrices[pIndices[0]];                                     \
        weight = __MM_SELECT(weights, 0);                                       \
        __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0);             \
        pMatrix1 = ppMatrices[pIndices[1]];                                     \
        weight = __MM_SELECT(weights, 1);                                       \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1);            \
        pMatrix2 = ppMatrices[pIndices[2]];                                     \
        weight = __MM_SELECT(weights, 2);                                       \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2);            \
        pMatrix3 = ppMatrices[pIndices[3]];                                     \
        weight = __MM_SELECT(weights, 3);                                       \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix3);            \
    }



//---------------------------------------------------------------------
// Collapse one matrix at a time. The collapsed matrix is weighted by the
// blend weights and can then be used to transform the corresponding vertex
// directly.
//
// I'd like to use an inline function instead of a macro here, but I also
// want to ensure the compiler integrates this code into its callers (in
// release builds at least), regardless of the specific compile options.
// An inline function works fine for VC, but gcc (3.4.4 here) seems to
// generate a function call even when compiling with "-O3".
//
#define _collapseOneMatrix(                                                     \
        m00, m01, m02,                                                          \
        pBlendWeight, pBlendIndex,                                              \
        blendMatrices,                                                          \
        blendWeightStride, blendIndexStride,                                    \
        numWeightsPerVertex)                                                    \
    {                                                                           \
        /* Important note: if pMatrixXXX is reused frequently, M$ VC7.1 will */ \
        /* generate wrong code here!!!                                       */ \
        const Matrix4* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3;               \
        __m128 weight, weights;                                                 \
                                                                                \
        switch (numWeightsPerVertex)                                            \
        {                                                                       \
        default:    /* Just in case, and to make the compiler happy */          \
        case 1:                                                                 \
            __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 2:                                                                 \
            __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 3:                                                                 \
            __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 4:                                                                 \
            __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            break;                                                              \
        }                                                                       \
    }

//---------------------------------------------------------------------
// Collapse four matrices at a time. The collapsed matrices are weighted by
// the blend weights and can then be used to transform the corresponding
// vertices directly.
//
// I'd like to use an inline function instead of a macro here, but I also
// want to ensure the compiler integrates this code into its callers (in
// release builds at least), regardless of the specific compile options.
// An inline function works fine for VC, but gcc (3.4.4 here) seems to
// generate a function call even when compiling with "-O3".
//
#define _collapseFourMatrices(                                                  \
        m00, m01, m02,                                                          \
        m10, m11, m12,                                                          \
        m20, m21, m22,                                                          \
        m30, m31, m32,                                                          \
        pBlendWeight, pBlendIndex,                                              \
        blendMatrices,                                                          \
        blendWeightStride, blendIndexStride,                                    \
        numWeightsPerVertex)                                                    \
    {                                                                           \
        /* Important note: if pMatrixXXX is reused frequently, M$ VC7.1 will */ \
        /* generate wrong code here!!!                                       */ \
        const Matrix4* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3;               \
        __m128 weight, weights;                                                 \
                                                                                \
        switch (numWeightsPerVertex)                                            \
        {                                                                       \
        default:    /* Just in case, and to make the compiler happy */          \
        case 1:                                                                 \
            __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W1(m10, m11, m12, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W1(m20, m21, m22, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W1(m30, m31, m32, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 2:                                                                 \
            __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W2(m10, m11, m12, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W2(m20, m21, m22, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W2(m30, m31, m32, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 3:                                                                 \
            __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W3(m10, m11, m12, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W3(m20, m21, m22, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W3(m30, m31, m32, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
            break;                                                              \
                                                                                \
        case 4:                                                                 \
            __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W4(m10, m11, m12, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W4(m20, m21, m22, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride));         \
            __COLLAPSE_MATRIX_W4(m30, m31, m32, blendMatrices,                  \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride),            \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride));         \
            break;                                                              \
        }                                                                       \
    }


    //---------------------------------------------------------------------
    // General SSE version: skins positions and, optionally, normals.
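    // This routine makes no assumption about buffer packing or alignment: it
    // honours the per-buffer strides and processes one vertex per iteration,
    // so it also serves as the fallback for the unaligned leading vertices and
    // the trailing remainder left over by the unrolled routines below.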
    static void softwareVertexSkinning_SSE_General(
        const float *pSrcPos, float *pDestPos,
        const float *pSrcNorm, float *pDestNorm,
        const float *pBlendWeight, const unsigned char* pBlendIndex,
        const Matrix4* const* blendMatrices,
        size_t srcPosStride, size_t destPosStride,
        size_t srcNormStride, size_t destNormStride,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numVertices)
    {
        for (size_t i = 0; i < numVertices; ++i)
        {
            // Collapse matrices
            __m128 m00, m01, m02;
            _collapseOneMatrix(
                m00, m01, m02,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex);

            // Advance blend weight and index pointers
            advanceRawPointer(pBlendWeight, blendWeightStride);
            advanceRawPointer(pBlendIndex, blendIndexStride);

            //------------------------------------------------------------------

            // Rearrange to a column-major matrix with the rows shuffled to the order: Z 0 X Y
            __m128 m03 = _mm_setzero_ps();
            __MM_TRANSPOSE4x4_PS(m02, m03, m00, m01);

            //------------------------------------------------------------------
            // Transform position
            //------------------------------------------------------------------

            __m128 s0, s1, s2;

            // Load source position
            s0 = _mm_load_ps1(pSrcPos + 0);
            s1 = _mm_load_ps1(pSrcPos + 1);
            s2 = _mm_load_ps1(pSrcPos + 2);

            // Transform by the collapsed matrix
            __m128 accumPos = __MM_DOT4x3_PS(m02, m03, m00, m01, s0, s1, s2);   // z 0 x y

            // Store the blended position; no alignment requirement
            _mm_storeh_pi((__m64*)pDestPos, accumPos);
            _mm_store_ss(pDestPos + 2, accumPos);

            // Advance source and target position pointers
            advanceRawPointer(pSrcPos, srcPosStride);
            advanceRawPointer(pDestPos, destPosStride);

            //------------------------------------------------------------------
            // Optionally blend the normal
            //------------------------------------------------------------------

            if (pSrcNorm)
            {
                // Load source normal
                s0 = _mm_load_ps1(pSrcNorm + 0);
                s1 = _mm_load_ps1(pSrcNorm + 1);
                s2 = _mm_load_ps1(pSrcNorm + 2);

                // Transform by the collapsed matrix
                __m128 accumNorm = __MM_DOT3x3_PS(m02, m03, m00, s0, s1, s2);   // z 0 x y

                // Normalise the normal
                __m128 tmp = _mm_mul_ps(accumNorm, accumNorm);                  // z^2 0 x^2 y^2
                tmp = __MM_ACCUM3_PS(tmp,
                    _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,1,2)),             // x^2 0 y^2 z^2
                    _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,0,1,3)));            // y^2 0 z^2 x^2
                // Note: the zero lane gets divided here too, but that's negligible
                tmp = __MM_RSQRT_PS(tmp);
                accumNorm = _mm_mul_ps(accumNorm, tmp);

                // Store the blended normal; no alignment requirement
                _mm_storeh_pi((__m64*)pDestNorm, accumNorm);
                _mm_store_ss(pDestNorm + 2, accumNorm);

                // Advance source and target normal pointers
                advanceRawPointer(pSrcNorm, srcNormStride);
                advanceRawPointer(pDestNorm, destNormStride);
            }
        }
    }
    //---------------------------------------------------------------------
    // Special SSE version: skins shared buffers of position and normal,
    // where the buffer is tightly packed (positions and normals interleaved).
    template <bool srcAligned, bool destAligned>
    struct SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed
    {
        static void apply(
            const float* pSrc, float* pDest,
            const float* pBlendWeight, const unsigned char* pBlendIndex,
            const Matrix4* const* blendMatrices,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numIterations)
        {
            typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
            typedef SSEMemoryAccessor<destAligned> DestAccessor;

            // Blend 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Collapse matrices
                __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
                _collapseFourMatrices(
                    m00, m01, m02,
                    m10, m11, m12,
                    m20, m21, m22,
                    m30, m31, m32,
                    pBlendWeight, pBlendIndex,
                    blendMatrices,
                    blendWeightStride, blendIndexStride,
                    numWeightsPerVertex);

                // Advance 4 vertices
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);

                //------------------------------------------------------------------
                // Transform positions/normals
                //------------------------------------------------------------------

                __m128 s0, s1, s2, s3, s4, s5, d0, d1, d2, d3, d4, d5;
                __m128 t0, t1, t2, t3, t4, t5;

                // Load source positions/normals
                s0 = SrcAccessor::load(pSrc + 0);                       // px0 py0 pz0 nx0
                s1 = SrcAccessor::load(pSrc + 4);                       // ny0 nz0 px1 py1
                s2 = SrcAccessor::load(pSrc + 8);                       // pz1 nx1 ny1 nz1
                s3 = SrcAccessor::load(pSrc + 12);                      // px2 py2 pz2 nx2
                s4 = SrcAccessor::load(pSrc + 16);                      // ny2 nz2 px3 py3
                s5 = SrcAccessor::load(pSrc + 20);                      // pz3 nx3 ny3 nz3

                // Rearrange to component-major form for batch calculation.
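                // (i.e. turn the six interleaved registers above into px, py, pz,
                //  nx, ny and nz vectors, each holding one component for all four
                //  vertices, so that each component can be transformed for four
                //  vertices at once)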

                t0 = _mm_unpacklo_ps(s0, s3);                           // px0 px2 py0 py2
                t1 = _mm_unpackhi_ps(s0, s3);                           // pz0 pz2 nx0 nx2
                t2 = _mm_unpacklo_ps(s1, s4);                           // ny0 ny2 nz0 nz2
                t3 = _mm_unpackhi_ps(s1, s4);                           // px1 px3 py1 py3
                t4 = _mm_unpacklo_ps(s2, s5);                           // pz1 pz3 nx1 nx3
                t5 = _mm_unpackhi_ps(s2, s5);                           // ny1 ny3 nz1 nz3

                s0 = _mm_unpacklo_ps(t0, t3);                           // px0 px1 px2 px3
                s1 = _mm_unpackhi_ps(t0, t3);                           // py0 py1 py2 py3
                s2 = _mm_unpacklo_ps(t1, t4);                           // pz0 pz1 pz2 pz3
                s3 = _mm_unpackhi_ps(t1, t4);                           // nx0 nx1 nx2 nx3
                s4 = _mm_unpacklo_ps(t2, t5);                           // ny0 ny1 ny2 ny3
                s5 = _mm_unpackhi_ps(t2, t5);                           // nz0 nz1 nz2 nz3

                // Transform by the collapsed matrices

                // Shuffle row 0 of the four collapsed matrices for calculating the X component
                __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);

                // Transform X components
                d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // PX0 PX1 PX2 PX3
                d3 = __MM_DOT3x3_PS(m00, m10, m20, s3, s4, s5);         // NX0 NX1 NX2 NX3

                // Shuffle row 1 of the four collapsed matrices for calculating the Y component
                __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);

                // Transform Y components
                d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // PY0 PY1 PY2 PY3
                d4 = __MM_DOT3x3_PS(m01, m11, m21, s3, s4, s5);         // NY0 NY1 NY2 NY3

                // Shuffle row 2 of the four collapsed matrices for calculating the Z component
                __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);

                // Transform Z components
                d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // PZ0 PZ1 PZ2 PZ3
                d5 = __MM_DOT3x3_PS(m02, m12, m22, s3, s4, s5);         // NZ0 NZ1 NZ2 NZ3

                // Normalise normals
                __m128 tmp = __MM_DOT3x3_PS(d3, d4, d5, d3, d4, d5);
                tmp = __MM_RSQRT_PS(tmp);
                d3 = _mm_mul_ps(d3, tmp);
                d4 = _mm_mul_ps(d4, tmp);
                d5 = _mm_mul_ps(d5, tmp);

                // Arrange back to the interleaved format for storing the results

                t0 = _mm_unpacklo_ps(d0, d1);                           // PX0 PY0 PX1 PY1
                t1 = _mm_unpackhi_ps(d0, d1);                           // PX2 PY2 PX3 PY3
                t2 = _mm_unpacklo_ps(d2, d3);                           // PZ0 NX0 PZ1 NX1
                t3 = _mm_unpackhi_ps(d2, d3);                           // PZ2 NX2 PZ3 NX3
                t4 = _mm_unpacklo_ps(d4, d5);                           // NY0 NZ0 NY1 NZ1
                t5 = _mm_unpackhi_ps(d4, d5);                           // NY2 NZ2 NY3 NZ3

                d0 = _mm_movelh_ps(t0, t2);                             // PX0 PY0 PZ0 NX0
                d1 = _mm_shuffle_ps(t4, t0, _MM_SHUFFLE(3,2,1,0));      // NY0 NZ0 PX1 PY1
                d2 = _mm_movehl_ps(t4, t2);                             // PZ1 NX1 NY1 NZ1
                d3 = _mm_movelh_ps(t1, t3);                             // PX2 PY2 PZ2 NX2
                d4 = _mm_shuffle_ps(t5, t1, _MM_SHUFFLE(3,2,1,0));      // NY2 NZ2 PX3 PY3
                d5 = _mm_movehl_ps(t5, t3);                             // PZ3 NX3 NY3 NZ3

                // Store blended positions/normals
                DestAccessor::store(pDest + 0, d0);
                DestAccessor::store(pDest + 4, d1);
                DestAccessor::store(pDest + 8, d2);
                DestAccessor::store(pDest + 12, d3);
                DestAccessor::store(pDest + 16, d4);
                DestAccessor::store(pDest + 20, d5);

                // Advance 4 vertices
                pSrc += 4 * (3 + 3);
                pDest += 4 * (3 + 3);
            }
        }
    };
    static FORCEINLINE void softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
        const float* pSrcPos, float* pDestPos,
        const float* pBlendWeight, const unsigned char* pBlendIndex,
        const Matrix4* const* blendMatrices,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numIterations)
    {
        // pSrcPos might not be 16-byte aligned, because the shared layout shifts alignment by 8 bytes per vertex

        // Instantiate only two versions, since the other alignment combinations are not that important.
        if (_isAlignedForSSE(pSrcPos) && _isAlignedForSSE(pDestPos))
        {
            SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<true, true>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
        else
        {
            SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<false, false>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
    }
    //---------------------------------------------------------------------
    // Special SSE version: skins separate position and normal buffers,
    // both of which are tightly packed.
    template <bool srcPosAligned, bool destPosAligned, bool srcNormAligned, bool destNormAligned>
    struct SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed
    {
        static void apply(
            const float* pSrcPos, float* pDestPos,
            const float* pSrcNorm, float* pDestNorm,
            const float* pBlendWeight, const unsigned char* pBlendIndex,
            const Matrix4* const* blendMatrices,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numIterations)
        {
            typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
            typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;
            typedef SSEMemoryAccessor<srcNormAligned> SrcNormAccessor;
            typedef SSEMemoryAccessor<destNormAligned> DestNormAccessor;

            // Blend 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Collapse matrices
                __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
                _collapseFourMatrices(
                    m00, m01, m02,
                    m10, m11, m12,
                    m20, m21, m22,
                    m30, m31, m32,
                    pBlendWeight, pBlendIndex,
                    blendMatrices,
                    blendWeightStride, blendIndexStride,
                    numWeightsPerVertex);

                // Advance 4 vertices
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);

                //------------------------------------------------------------------
                // Transform positions
                //------------------------------------------------------------------

                __m128 s0, s1, s2, d0, d1, d2;

                // Load source positions
                s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
                s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
                s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3

                // Arrange to 3x4 component-major form for batch calculation
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);

                // Transform by the collapsed matrices

                // Shuffle row 0 of the four collapsed matrices for calculating the X component
                __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);

                // Transform X components
                d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3

                // Shuffle row 1 of the four collapsed matrices for calculating the Y component
                __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);

                // Transform Y components
                d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3

                // Shuffle row 2 of the four collapsed matrices for calculating the Z component
                __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);

                // Transform Z components
                d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3

                // Arrange back to 4x3 interleaved format for storing the results
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store blended positions
                DestPosAccessor::store(pDestPos + 0, d0);
                DestPosAccessor::store(pDestPos + 4, d1);
                DestPosAccessor::store(pDestPos + 8, d2);

                // Advance 4 vertices
                pSrcPos += 4 * 3;
                pDestPos += 4 * 3;

                //------------------------------------------------------------------
                // Transform normals
                //------------------------------------------------------------------

                // Load source normals
                s0 = SrcNormAccessor::load(pSrcNorm + 0);               // x0 y0 z0 x1
                s1 = SrcNormAccessor::load(pSrcNorm + 4);               // y1 z1 x2 y2
                s2 = SrcNormAccessor::load(pSrcNorm + 8);               // z2 x3 y3 z3

                // Arrange to 3x4 component-major form for batch calculation
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);

                // Transform by the collapsed and already shuffled matrices
                d0 = __MM_DOT3x3_PS(m00, m10, m20, s0, s1, s2);         // X0 X1 X2 X3
                d1 = __MM_DOT3x3_PS(m01, m11, m21, s0, s1, s2);         // Y0 Y1 Y2 Y3
                d2 = __MM_DOT3x3_PS(m02, m12, m22, s0, s1, s2);         // Z0 Z1 Z2 Z3

                // Normalise normals
                __m128 tmp = __MM_DOT3x3_PS(d0, d1, d2, d0, d1, d2);
                tmp = __MM_RSQRT_PS(tmp);
                d0 = _mm_mul_ps(d0, tmp);
                d1 = _mm_mul_ps(d1, tmp);
                d2 = _mm_mul_ps(d2, tmp);

                // Arrange back to 4x3 interleaved format for storing the results
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store blended normals
                DestNormAccessor::store(pDestNorm + 0, d0);
                DestNormAccessor::store(pDestNorm + 4, d1);
                DestNormAccessor::store(pDestNorm + 8, d2);

                // Advance 4 vertices
                pSrcNorm += 4 * 3;
                pDestNorm += 4 * 3;
            }
        }
    };
    static FORCEINLINE void softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
        const float* pSrcPos, float* pDestPos,
        const float* pSrcNorm, float* pDestNorm,
        const float* pBlendWeight, const unsigned char* pBlendIndex,
        const Matrix4* const* blendMatrices,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numIterations)
    {
        assert(_isAlignedForSSE(pSrcPos));

        // Instantiate only two versions, since the other alignment combinations are not that important.
        if (_isAlignedForSSE(pSrcNorm) && _isAlignedForSSE(pDestPos) && _isAlignedForSSE(pDestNorm))
        {
            SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, true, true, true>::apply(
                pSrcPos, pDestPos,
                pSrcNorm, pDestNorm,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
        else
        {
            SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, false, false, false>::apply(
                pSrcPos, pDestPos,
                pSrcNorm, pDestNorm,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
    }
    //---------------------------------------------------------------------
    // Special SSE version: skins positions only, where the position buffer
    // is tightly packed.
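    // Only positions are transformed here, so the normal-related work of the
    // routines above (the extra dot products and the normalisation) is dropped;
    // otherwise it is the same four-vertices-per-iteration pattern.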
    template <bool srcPosAligned, bool destPosAligned>
    struct SoftwareVertexSkinning_SSE_PosOnly_Packed
    {
        static void apply(
            const float* pSrcPos, float* pDestPos,
            const float* pBlendWeight, const unsigned char* pBlendIndex,
            const Matrix4* const* blendMatrices,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numIterations)
        {
            typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
            typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;

            // Blend 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Collapse matrices
                __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
                _collapseFourMatrices(
                    m00, m01, m02,
                    m10, m11, m12,
                    m20, m21, m22,
                    m30, m31, m32,
                    pBlendWeight, pBlendIndex,
                    blendMatrices,
                    blendWeightStride, blendIndexStride,
                    numWeightsPerVertex);

                // Advance 4 vertices
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);

                //------------------------------------------------------------------
                // Transform positions
                //------------------------------------------------------------------

                __m128 s0, s1, s2, d0, d1, d2;

                // Load source positions
                s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
                s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
                s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3

                // Arrange to 3x4 component-major form for batch calculation
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);

                // Transform by the collapsed matrices

                // Shuffle row 0 of the four collapsed matrices for calculating the X component
                __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);

                // Transform X components
                d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3

                // Shuffle row 1 of the four collapsed matrices for calculating the Y component
                __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);

                // Transform Y components
                d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3

                // Shuffle row 2 of the four collapsed matrices for calculating the Z component
                __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);

                // Transform Z components
                d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3

                // Arrange back to 4x3 interleaved format for storing the results
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store blended positions
                DestPosAccessor::store(pDestPos + 0, d0);
                DestPosAccessor::store(pDestPos + 4, d1);
                DestPosAccessor::store(pDestPos + 8, d2);

                // Advance 4 vertices
                pSrcPos += 4 * 3;
                pDestPos += 4 * 3;
            }
        }
    };
    static FORCEINLINE void softwareVertexSkinning_SSE_PosOnly_Packed(
        const float* pSrcPos, float* pDestPos,
        const float* pBlendWeight, const unsigned char* pBlendIndex,
        const Matrix4* const* blendMatrices,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numIterations)
    {
        assert(_isAlignedForSSE(pSrcPos));

        // Instantiate only two versions, since the other alignment combinations are not that important.
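        // pSrcPos is asserted to be aligned above, so only the destination
        // alignment needs to be dispatched on here.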
        if (_isAlignedForSSE(pDestPos))
        {
            SoftwareVertexSkinning_SSE_PosOnly_Packed<true, true>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
        else
        {
            SoftwareVertexSkinning_SSE_PosOnly_Packed<true, false>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
    }
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    OptimisedUtilSSE::OptimisedUtilSSE(void)
        : mPreferGeneralVersionForSharedBuffers(false)
    {
        // For the AMD Athlon XP (but not the Athlon 64), it's preferable never to
        // use the unrolled version for shared buffers at all; I guess that version
        // runs out of usable CPU registers, or hits an L1/L2 cache related problem,
        // and so loses slightly to the general version.
        //
        if (PlatformInformation::getCpuIdentifier().find("AuthenticAMD") != String::npos)
        {
            // How can I check that it's an Athlon XP but not an Athlon 64?
            // OK, just test whether it supports SSE2/SSE3 or not; if not,
            // assume the general version is faster than the unrolled version :)
            //
            if (!(PlatformInformation::getCpuFeatures() &
                (PlatformInformation::CPU_FEATURE_SSE2 | PlatformInformation::CPU_FEATURE_SSE3)))
            {
                mPreferGeneralVersionForSharedBuffers = true;
            }
        }
    }
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::softwareVertexSkinning(
        const float *pSrcPos, float *pDestPos,
        const float *pSrcNorm, float *pDestNorm,
        const float *pBlendWeight, const unsigned char* pBlendIndex,
        const Matrix4* const* blendMatrices,
        size_t srcPosStride, size_t destPosStride,
        size_t srcNormStride, size_t destNormStride,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numVertices)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        // All position/normal pointers should be perfectly aligned, but we still check
        // here to guard against hardware buffers allocated by a potentially buggy
        // driver that doesn't honour alignment properly.
        // Because a meta-function technique is used here, the code stays easy to
        // maintain and still covers all possible alignment combinations.
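        //
        // Dispatch order below: packed shared position/normal buffer first, then
        // packed separate buffers, then packed position-only; the general
        // one-vertex-at-a-time routine handles the unaligned leading vertices and
        // whatever remainder the unrolled loops leave behind.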
        //

        // Use the unrolled routines only if there are a lot of vertices
        if (numVertices > OGRE_SSE_SKINNING_UNROLL_VERTICES)
        {
            if (pSrcNorm)
            {
                // Blend position and normal

                if (!mPreferGeneralVersionForSharedBuffers &&
                    srcPosStride == sizeof(float) * (3 + 3) && destPosStride == sizeof(float) * (3 + 3) &&
                    pSrcNorm == pSrcPos + 3 && pDestNorm == pDestPos + 3)
                {
                    // Position and normal share a packed buffer

                    size_t srcPosAlign = (size_t)pSrcPos & 15;
                    assert((srcPosAlign & 3) == 0);

                    // Blend unaligned vertices with the general SIMD routine
                    if (srcPosAlign == 8)   // Because of the 8-byte alignment shift per vertex
                    {
                        size_t count = srcPosAlign / 8;
                        numVertices -= count;
                        softwareVertexSkinning_SSE_General(
                            pSrcPos, pDestPos,
                            pSrcNorm, pDestNorm,
                            pBlendWeight, pBlendIndex,
                            blendMatrices,
                            srcPosStride, destPosStride,
                            srcNormStride, destNormStride,
                            blendWeightStride, blendIndexStride,
                            numWeightsPerVertex,
                            count);

                        pSrcPos += count * (3 + 3);
                        pDestPos += count * (3 + 3);
                        pSrcNorm += count * (3 + 3);
                        pDestNorm += count * (3 + 3);
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
                    }

                    // Blend vertices, four per iteration
                    size_t numIterations = numVertices / 4;
                    softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
                        pSrcPos, pDestPos,
                        pBlendWeight, pBlendIndex,
                        blendMatrices,
                        blendWeightStride, blendIndexStride,
                        numWeightsPerVertex,
                        numIterations);

                    // Advance pointers for the remaining vertices
                    numVertices &= 3;
                    if (numVertices)
                    {
                        pSrcPos += numIterations * 4 * (3 + 3);
                        pDestPos += numIterations * 4 * (3 + 3);
                        pSrcNorm += numIterations * 4 * (3 + 3);
                        pDestNorm += numIterations * 4 * (3 + 3);
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
                    }
                }
                else if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3 &&
                         srcNormStride == sizeof(float) * 3 && destNormStride == sizeof(float) * 3)
                {
                    // Position and normal are in separate buffers, and all of them are packed

                    size_t srcPosAlign = (size_t)pSrcPos & 15;
                    assert((srcPosAlign & 3) == 0);

                    // Blend unaligned vertices with the general SIMD routine
                    if (srcPosAlign)
                    {
                        size_t count = srcPosAlign / 4;
                        numVertices -= count;
                        softwareVertexSkinning_SSE_General(
                            pSrcPos, pDestPos,
                            pSrcNorm, pDestNorm,
                            pBlendWeight, pBlendIndex,
                            blendMatrices,
                            srcPosStride, destPosStride,
                            srcNormStride, destNormStride,
                            blendWeightStride, blendIndexStride,
                            numWeightsPerVertex,
                            count);

                        pSrcPos += count * 3;
                        pDestPos += count * 3;
                        pSrcNorm += count * 3;
                        pDestNorm += count * 3;
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
                    }

                    // Blend vertices, four per iteration
                    size_t numIterations = numVertices / 4;
                    softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
                        pSrcPos, pDestPos,
                        pSrcNorm, pDestNorm,
                        pBlendWeight, pBlendIndex,
                        blendMatrices,
                        blendWeightStride, blendIndexStride,
                        numWeightsPerVertex,
                        numIterations);

                    // Advance pointers for the remaining vertices
                    numVertices &= 3;
                    if (numVertices)
                    {
                        pSrcPos += numIterations * 4 * 3;
                        pDestPos += numIterations * 4 * 3;
                        pSrcNorm += numIterations * 4 * 3;
                        pDestNorm += numIterations * 4 * 3;
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
                    }
                }
                else    // Not in 'packed' form, or wrong ordering between position and normal
                {
                    // Should never occur; do nothing here, just in case
                }
            }
            else    // !pSrcNorm
            {
                // Blend position only

                if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3)
                {
                    // All buffers are packed

                    size_t srcPosAlign = (size_t)pSrcPos & 15;
                    assert((srcPosAlign & 3) == 0);

                    // Blend unaligned vertices with the general SIMD routine
                    if (srcPosAlign)
                    {
                        size_t count = srcPosAlign / 4;
                        numVertices -= count;
                        softwareVertexSkinning_SSE_General(
                            pSrcPos, pDestPos,
                            pSrcNorm, pDestNorm,
                            pBlendWeight, pBlendIndex,
                            blendMatrices,
                            srcPosStride, destPosStride,
                            srcNormStride, destNormStride,
                            blendWeightStride, blendIndexStride,
                            numWeightsPerVertex,
                            count);

                        pSrcPos += count * 3;
                        pDestPos += count * 3;
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
                    }

                    // Blend vertices, four per iteration
                    size_t numIterations = numVertices / 4;
                    softwareVertexSkinning_SSE_PosOnly_Packed(
                        pSrcPos, pDestPos,
                        pBlendWeight, pBlendIndex,
                        blendMatrices,
                        blendWeightStride, blendIndexStride,
                        numWeightsPerVertex,
                        numIterations);

                    // Advance pointers for the remaining vertices
                    numVertices &= 3;
                    if (numVertices)
                    {
                        pSrcPos += numIterations * 4 * 3;
                        pDestPos += numIterations * 4 * 3;
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
                    }
                }
                else    // Not in 'packed' form
                {
                    // Might occur only if the user forced software blending of positions only
                }
            }
        }

        // Blend any remaining vertices; this must also be done with SIMD to get
        // identical results, since mixing the general floating-point path with the
        // SIMD path would introduce floating-point differences.
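        // (For example, the SIMD path normalises with an approximate reciprocal
        // square root, which does not round the same way scalar code would.)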
        if (numVertices)
        {
            softwareVertexSkinning_SSE_General(
                pSrcPos, pDestPos,
                pSrcNorm, pDestNorm,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                srcPosStride, destPosStride,
                srcNormStride, destNormStride,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numVertices);
        }
    }
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::softwareVertexMorph(
        Real t,
        const float *pSrc1, const float *pSrc2,
        float *pDst,
        size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
        size_t numVertices,
        bool morphNormals)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        __m128 src01, src02, src11, src12, src21, src22;
        __m128 dst0, dst1, dst2;

        __m128 t4 = _mm_load_ps1(&t);


        // If we're morphing normals, we have twice the number of floats to process.
        // Positions are interleaved with normals, so we'll have to separately
        // normalise just the normals later; in the first pass we just lerp.
        // We can't normalise as we go, because normals and positions are only
        // 3 floats each, so they are not aligned for SSE and we'd mix the data up.
        size_t normalsMultiplier = morphNormals ? 2 : 1;
        size_t numIterations = (numVertices * normalsMultiplier) / 4;
        size_t numVerticesRemainder = (numVertices * normalsMultiplier) & 3;

        // Save for later
        float *pStartDst = pDst;

        // Never use the meta-function technique for accessing memory here, because
        // VC7.1 seems to generate slightly inefficient binary code when the following
        // code is put into an inline function.

        if (_isAlignedForSSE(pSrc1) && _isAlignedForSSE(pSrc2) && _isAlignedForSSE(pDst))
        {
            // All data aligned

            // Morph 4 vertices per iteration. Specially designed to use as many of
            // the available CPU registers as possible (7 registers used here), and
            // to avoid temporary values allocated on the stack, which would cause
            // extra memory accesses.
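            // __MM_LERP_PS(t, a, b) is assumed here to expand to a + t * (b - a) per
            // component, so each dst below is src1 + t * (src2 - src1).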
            for (size_t i = 0; i < numIterations; ++i)
            {
                // 12 floating-point values
                src01 = __MM_LOAD_PS(pSrc1 + 0);
                src02 = __MM_LOAD_PS(pSrc2 + 0);
                src11 = __MM_LOAD_PS(pSrc1 + 4);
                src12 = __MM_LOAD_PS(pSrc2 + 4);
                src21 = __MM_LOAD_PS(pSrc1 + 8);
                src22 = __MM_LOAD_PS(pSrc2 + 8);
                pSrc1 += 12; pSrc2 += 12;

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);
                dst2 = __MM_LERP_PS(t4, src21, src22);

                __MM_STORE_PS(pDst + 0, dst0);
                __MM_STORE_PS(pDst + 4, dst1);
                __MM_STORE_PS(pDst + 8, dst2);
                pDst += 12;
            }

            // Morph remaining vertices
            switch (numVerticesRemainder)
            {
            case 3:
                // 9 floating-point values
                src01 = __MM_LOAD_PS(pSrc1 + 0);
                src02 = __MM_LOAD_PS(pSrc2 + 0);
                src11 = __MM_LOAD_PS(pSrc1 + 4);
                src12 = __MM_LOAD_PS(pSrc2 + 4);
                src21 = _mm_load_ss(pSrc1 + 8);
                src22 = _mm_load_ss(pSrc2 + 8);

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);
                dst2 = __MM_LERP_SS(t4, src21, src22);

                __MM_STORE_PS(pDst + 0, dst0);
                __MM_STORE_PS(pDst + 4, dst1);
                _mm_store_ss(pDst + 8, dst2);
                break;

            case 2:
                // 6 floating-point values
                src01 = __MM_LOAD_PS(pSrc1 + 0);
                src02 = __MM_LOAD_PS(pSrc2 + 0);
                src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4));    // t4 is meaningless here
                src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4));    // t4 is meaningless here

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);

                __MM_STORE_PS(pDst + 0, dst0);
                _mm_storel_pi((__m64*)(pDst + 4), dst1);
                break;

            case 1:
                // 3 floating-point values
                src01 = _mm_load_ss(pSrc1 + 2);
                src02 = _mm_load_ss(pSrc2 + 2);
                src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0));
                src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0));

                dst0 = __MM_LERP_PS(t4, src01, src02);

                _mm_storeh_pi((__m64*)(pDst + 0), dst0);
                _mm_store_ss(pDst + 2, dst0);
                break;
            }
        }
        else    // Should never occur, just in case of buggy drivers
        {
            // Assume all data unaligned

            // Morph 4 vertices per iteration. Specially designed to use as many of
            // the available CPU registers as possible (7 registers used here), and
            // to avoid temporary values allocated on the stack, which would cause
            // extra memory accesses.
            for (size_t i = 0; i < numIterations; ++i)
            {
                // 12 floating-point values
                src01 = _mm_loadu_ps(pSrc1 + 0);
                src02 = _mm_loadu_ps(pSrc2 + 0);
                src11 = _mm_loadu_ps(pSrc1 + 4);
                src12 = _mm_loadu_ps(pSrc2 + 4);
                src21 = _mm_loadu_ps(pSrc1 + 8);
                src22 = _mm_loadu_ps(pSrc2 + 8);
                pSrc1 += 12; pSrc2 += 12;

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);
                dst2 = __MM_LERP_PS(t4, src21, src22);

                _mm_storeu_ps(pDst + 0, dst0);
                _mm_storeu_ps(pDst + 4, dst1);
                _mm_storeu_ps(pDst + 8, dst2);
                pDst += 12;
            }

            // Morph remaining vertices
            switch (numVerticesRemainder)
            {
            case 3:
                // 9 floating-point values
                src01 = _mm_loadu_ps(pSrc1 + 0);
                src02 = _mm_loadu_ps(pSrc2 + 0);
                src11 = _mm_loadu_ps(pSrc1 + 4);
                src12 = _mm_loadu_ps(pSrc2 + 4);
                src21 = _mm_load_ss(pSrc1 + 8);
                src22 = _mm_load_ss(pSrc2 + 8);

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);
                dst2 = __MM_LERP_SS(t4, src21, src22);

                _mm_storeu_ps(pDst + 0, dst0);
                _mm_storeu_ps(pDst + 4, dst1);
                _mm_store_ss(pDst + 8, dst2);
                break;

            case 2:
                // 6 floating-point values
                src01 = _mm_loadu_ps(pSrc1 + 0);
                src02 = _mm_loadu_ps(pSrc2 + 0);
                src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4));    // t4 is meaningless here
                src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4));    // t4 is meaningless here

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);

                _mm_storeu_ps(pDst + 0, dst0);
                _mm_storel_pi((__m64*)(pDst + 4), dst1);
                break;

            case 1:
                // 3 floating-point values
                src01 = _mm_load_ss(pSrc1 + 2);
                src02 = _mm_load_ss(pSrc2 + 2);
                src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0));
                src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0));

                dst0 = __MM_LERP_PS(t4, src01, src02);

                _mm_storeh_pi((__m64*)(pDst + 0), dst0);
                _mm_store_ss(pDst + 2, dst0);
                break;
            }
        }

        if (morphNormals)
        {
            // Now we need to do an unaligned normalise on the normal data we just
            // lerped; because normals are 3 elements each, they're always unaligned.
            float *pNorm = pStartDst;

            // Offset past the first position
            pNorm += 3;

            // We'll do one normal per iteration, but still use SSE
            for (size_t n = 0; n < numVertices; ++n)
            {
                // normalise function
                __m128 norm;

                // Load 3 floating-point normal values.
                // This loads into [0] and clears the rest
                norm = _mm_load_ss(pNorm + 2);
                // This loads into [2,3]; [1] is unused
                norm = _mm_loadh_pi(norm, (__m64*)(pNorm + 0));

                // Fill a 4-vector with the squared vector length
                __m128 tmp = _mm_mul_ps(norm, norm);
                // Add - for this we want this effect:
                // orig   3 | 2 | 1 | 0
                // add1   0 | 0 | 0 | 2
                // add2   2 | 3 | 0 | 3
                // This way elements 0, 2 and 3 have the sum of all entries (except 1, which is unused)

                tmp = _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,0,0,2)));
                // Add the final combination & sqrt
                // The bottom 3 elements will hold the length; we don't care about the 4th
                tmp = _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,3,0,3)));
                // Then divide to normalise
                norm = _mm_div_ps(norm, _mm_sqrt_ps(tmp));

                // Store back in the same place
                _mm_storeh_pi((__m64*)(pNorm + 0), norm);
                _mm_store_ss(pNorm + 2, norm);

                // Skip to the next vertex (3 normal components, 3 position components)
                pNorm += 6;
            }
        }
    }
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::concatenateAffineMatrices(
        const Matrix4& baseMatrix,
        const Matrix4* pSrcMat,
        Matrix4* pDstMat,
        size_t numMatrices)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        assert(_isAlignedForSSE(pSrcMat));
        assert(_isAlignedForSSE(pDstMat));

        // Load the base matrix, unaligned
        __m128 m0 = _mm_loadu_ps(baseMatrix[0]);
        __m128 m1 = _mm_loadu_ps(baseMatrix[1]);
        __m128 m2 = _mm_loadu_ps(baseMatrix[2]);
        __m128 m3 = _mm_loadu_ps(baseMatrix[3]);    // m3 should be equal to (0, 0, 0, 1)

        for (size_t i = 0; i < numMatrices; ++i)
        {
            // Load the source matrix, aligned
            __m128 s0 = __MM_LOAD_PS((*pSrcMat)[0]);
            __m128 s1 = __MM_LOAD_PS((*pSrcMat)[1]);
            __m128 s2 = __MM_LOAD_PS((*pSrcMat)[2]);

            ++pSrcMat;

            __m128 t0, t1, t2, t3;

            // Concatenate the matrices and store the results

            // Row 0
            t0 = _mm_mul_ps(__MM_SELECT(m0, 0), s0);
            t1 = _mm_mul_ps(__MM_SELECT(m0, 1), s1);
            t2 = _mm_mul_ps(__MM_SELECT(m0, 2), s2);
            t3 = _mm_mul_ps(m0, m3);    // The compiler should hoist this out of the loop
            __MM_STORE_PS((*pDstMat)[0], __MM_ACCUM4_PS(t0,t1,t2,t3));

            // Row 1
            t0 = _mm_mul_ps(__MM_SELECT(m1, 0), s0);
            t1 = _mm_mul_ps(__MM_SELECT(m1, 1), s1);
            t2 = _mm_mul_ps(__MM_SELECT(m1, 2), s2);
            t3 = _mm_mul_ps(m1, m3);    // The compiler should hoist this out of the loop
            __MM_STORE_PS((*pDstMat)[1], __MM_ACCUM4_PS(t0,t1,t2,t3));

            // Row 2
            t0 = _mm_mul_ps(__MM_SELECT(m2, 0), s0);
            t1 = _mm_mul_ps(__MM_SELECT(m2, 1), s1);
            t2 = _mm_mul_ps(__MM_SELECT(m2, 2), s2);
            t3 = _mm_mul_ps(m2, m3);    // The compiler should hoist this out of the loop
            __MM_STORE_PS((*pDstMat)[2], __MM_ACCUM4_PS(t0,t1,t2,t3));

            // Row 3
            __MM_STORE_PS((*pDstMat)[3], m3);

            ++pDstMat;
        }
    }
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::calculateFaceNormals(
        const float *positions,
        const EdgeData::Triangle *triangles,
        Vector4 *faceNormals,
        size_t numTriangles)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        assert(_isAlignedForSSE(faceNormals));
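
        // Each output Vector4 holds the triangle's plane equation: the
        // unnormalised face normal (nx, ny, nz) in xyz and w = -(n . v0).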

        // Load Vector3 as: (x, 0, y, z)
#define __LOAD_VECTOR3(p)   _mm_loadh_pi(_mm_load_ss(p), (const __m64*)((p)+1))

        // Mask used to change the sign of single precision floating point values.
        OGRE_SIMD_ALIGNED_DECL(static const uint32, msSignMask[4]) =
        {
            0x80000000, 0x80000000, 0x80000000, 0x80000000,
        };

        size_t numIterations = numTriangles / 4;
        numTriangles &= 3;

        // Four triangles per iteration
        for (size_t i = 0; i < numIterations; ++i)
        {
            // Load four Vector3 as: (x0, x1, x2, x3), (y0, y1, y2, y3), (z0, z1, z2, z3)
#define __LOAD_FOUR_VECTOR3(x, y, z, p0, p1, p2, p3)                    \
            {                                                           \
                __m128 v0 = __LOAD_VECTOR3(p0);     /* x0 -- y0 z0 */   \
                __m128 v1 = __LOAD_VECTOR3(p1);     /* x1 -- y1 z1 */   \
                __m128 v2 = __LOAD_VECTOR3(p2);     /* x2 -- y2 z2 */   \
                __m128 v3 = __LOAD_VECTOR3(p3);     /* x3 -- y3 z3 */   \
                __m128 t0, t1;                                          \
                                                                        \
                t0 = _mm_unpacklo_ps(v0, v2);       /* x0 x2 -- -- */   \
                t1 = _mm_unpacklo_ps(v1, v3);       /* x1 x3 -- -- */   \
                x  = _mm_unpacklo_ps(t0, t1);       /* x0 x1 x2 x3 */   \
                                                                        \
                t0 = _mm_unpackhi_ps(v0, v2);       /* y0 y2 z0 z2 */   \
                t1 = _mm_unpackhi_ps(v1, v3);       /* y1 y3 z1 z3 */   \
                y  = _mm_unpacklo_ps(t0, t1);       /* y0 y1 y2 y3 */   \
                z  = _mm_unpackhi_ps(t0, t1);       /* z0 z1 z2 z3 */   \
            }

            __m128 x0, x1, x2, y0, y1, y2, z0, z1, z2;

            // Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz
            __LOAD_FOUR_VECTOR3(x0, y0, z0,
                positions + triangles[0].vertIndex[0] * 3,
                positions + triangles[1].vertIndex[0] * 3,
                positions + triangles[2].vertIndex[0] * 3,
                positions + triangles[3].vertIndex[0] * 3);

            // Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz
            __LOAD_FOUR_VECTOR3(x1, y1, z1,
                positions + triangles[0].vertIndex[1] * 3,
                positions + triangles[1].vertIndex[1] * 3,
                positions + triangles[2].vertIndex[1] * 3,
                positions + triangles[3].vertIndex[1] * 3);

            // Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz
            __LOAD_FOUR_VECTOR3(x2, y2, z2,
                positions + triangles[0].vertIndex[2] * 3,
                positions + triangles[1].vertIndex[2] * 3,
                positions + triangles[2].vertIndex[2] * 3,
                positions + triangles[3].vertIndex[2] * 3);

            triangles += 4;

            // Calculate triangle face normals

            // a = v1 - v0
            __m128 ax = _mm_sub_ps(x1, x0);
            __m128 ay = _mm_sub_ps(y1, y0);
            __m128 az = _mm_sub_ps(z1, z0);

            // b = v2 - v0
            __m128 bx = _mm_sub_ps(x2, x0);
            __m128 by = _mm_sub_ps(y2, y0);
            __m128 bz = _mm_sub_ps(z2, z0);

            // n = a cross b
            __m128 nx = _mm_sub_ps(_mm_mul_ps(ay, bz), _mm_mul_ps(az, by));
            __m128 ny = _mm_sub_ps(_mm_mul_ps(az, bx), _mm_mul_ps(ax, bz));
            __m128 nz = _mm_sub_ps(_mm_mul_ps(ax, by), _mm_mul_ps(ay, bx));

            // w = - (n dot v0)
            __m128 nw = _mm_xor_ps(
                __MM_DOT3x3_PS(nx, ny, nz, x0, y0, z0),
                *(const __m128 *)&msSignMask);

            // Arrange to per-triangle face normal major format
            __MM_TRANSPOSE4x4_PS(nx, ny, nz, nw);

            // Store results
            __MM_STORE_PS(&faceNormals[0].x, nx);
            __MM_STORE_PS(&faceNormals[1].x, ny);
            __MM_STORE_PS(&faceNormals[2].x, nz);
            __MM_STORE_PS(&faceNormals[3].x, nw);
            faceNormals += 4;

#undef __LOAD_FOUR_VECTOR3
        }
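
        // For reference: the single-triangle path below keeps each vertex in one register
        // laid out as (x, 0, y, z) and builds the cross product from two shuffles, two
        // multiplies and one subtract. A rough scalar view (illustrative only):
        //
        //     a = v1 - v0;  b = v2 - v0;
        //     n = Vector3(a.y*b.z - a.z*b.y,
        //                 a.z*b.x - a.x*b.z,
        //                 a.x*b.y - a.y*b.x);
        //     w = -(n.x*v0.x + n.y*v0.y + n.z*v0.z);
        //
        // The spare lane holding 0 lets the same trick work without any masking.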

        // Dealing with remaining triangles
        for (size_t j = 0; j < numTriangles; ++j)
        {
            // Load vertices of the triangle
            __m128 v0 = __LOAD_VECTOR3(positions + triangles->vertIndex[0] * 3);
            __m128 v1 = __LOAD_VECTOR3(positions + triangles->vertIndex[1] * 3);
            __m128 v2 = __LOAD_VECTOR3(positions + triangles->vertIndex[2] * 3);
            ++triangles;

            // Calculate face normal

            __m128 t0, t1;

            __m128 a = _mm_sub_ps(v1, v0);                      // ax 0 ay az
            __m128 b = _mm_sub_ps(v2, v0);                      // bx 0 by bz
            t0 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,0,1,3));    // az 0 ax ay
            t1 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2,0,1,3));    // bz 0 bx by
            t0 = _mm_mul_ps(t0, b);                             // az*bx 0 ax*by ay*bz
            t1 = _mm_mul_ps(t1, a);                             // ax*bz 0 ay*bx az*by

            __m128 n = _mm_sub_ps(t0, t1);                      // ny 0 nz nx

            __m128 d = _mm_mul_ps(                              // dy 0 dz dx
                _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,3,1,2)), n);

            n = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(               // nx ny nz -(dx+dy+dz)
                _mm_shuffle_ps(n, n, _MM_SHUFFLE(1,2,0,3)),     // nx ny nz 0
                _mm_shuffle_ps(d, d, _MM_SHUFFLE(3,1,1,1))),    // 0  0  0  dx
                _mm_shuffle_ps(d, d, _MM_SHUFFLE(0,1,1,1))),    // 0  0  0  dy
                _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,1,1,1)));    // 0  0  0  dz

            // Store result
            __MM_STORE_PS(&faceNormals->x, n);
            ++faceNormals;
        }

#undef __LOAD_VECTOR3
    }
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::calculateLightFacing(
        const Vector4& lightPos,
        const Vector4* faceNormals,
        char* lightFacings,
        size_t numFaces)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        assert(_isAlignedForSSE(faceNormals));

        // Map to convert a 4-bit mask to 4 byte values
        static const char msMaskMapping[16][4] =
        {
            {0, 0, 0, 0},   {1, 0, 0, 0},   {0, 1, 0, 0},   {1, 1, 0, 0},
            {0, 0, 1, 0},   {1, 0, 1, 0},   {0, 1, 1, 0},   {1, 1, 1, 0},
            {0, 0, 0, 1},   {1, 0, 0, 1},   {0, 1, 0, 1},   {1, 1, 0, 1},
            {0, 0, 1, 1},   {1, 0, 1, 1},   {0, 1, 1, 1},   {1, 1, 1, 1},
        };

        __m128 n0, n1, n2, n3;
        __m128 t0, t1;
        __m128 dp;
        int bitmask;

        // Load light vector, unaligned
        __m128 lp = _mm_loadu_ps(&lightPos.x);

        // Preload zero into a register for comparing the dot product values against
        __m128 zero = _mm_setzero_ps();

        size_t numIterations = numFaces / 4;
        numFaces &= 3;

        // Four faces per iteration
        for (size_t i = 0; i < numIterations; ++i)
        {
            // Load face normals, aligned
            n0 = __MM_LOAD_PS(&faceNormals[0].x);
            n1 = __MM_LOAD_PS(&faceNormals[1].x);
            n2 = __MM_LOAD_PS(&faceNormals[2].x);
            n3 = __MM_LOAD_PS(&faceNormals[3].x);
            faceNormals += 4;

            // Multiply by light vector
            n0 = _mm_mul_ps(n0, lp);                    // x0 y0 z0 w0
            n1 = _mm_mul_ps(n1, lp);                    // x1 y1 z1 w1
            n2 = _mm_mul_ps(n2, lp);                    // x2 y2 z2 w2
            n3 = _mm_mul_ps(n3, lp);                    // x3 y3 z3 w3

            // Horizontal add four vector values.
            t0 = _mm_add_ps(                            // x0+z0 x1+z1 y0+w0 y1+w1
                _mm_unpacklo_ps(n0, n1),                // x0 x1 y0 y1
                _mm_unpackhi_ps(n0, n1));               // z0 z1 w0 w1
            t1 = _mm_add_ps(                            // x2+z2 x3+z3 y2+w2 y3+w3
                _mm_unpacklo_ps(n2, n3),                // x2 x3 y2 y3
                _mm_unpackhi_ps(n2, n3));               // z2 z3 w2 w3
            dp = _mm_add_ps(                            // dp0 dp1 dp2 dp3
                _mm_movelh_ps(t0, t1),                  // x0+z0 x1+z1 x2+z2 x3+z3
                _mm_movehl_ps(t1, t0));                 // y0+w0 y1+w1 y2+w2 y3+w3
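
            // For reference: in scalar terms the four lanes of 'dp' now hold the 4D dot products
            //
            //     dp[i] = faceNormals[i].x*lightPos.x + faceNormals[i].y*lightPos.y
            //           + faceNormals[i].z*lightPos.z + faceNormals[i].w*lightPos.w;
            //
            // i.e. each face's plane equation evaluated against the light vector; the sign
            // tells whether the face points towards the light.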
            // Compare greater than zero and set up a 4-bit mask. Use '_mm_cmpnle_ps'
            // instead of '_mm_cmpgt_ps' here because we want to keep 'zero' untouched,
            // i.e. it's the 2nd operand of the assembly instruction. And in fact
            // '_mm_cmpgt_ps' was implemented as 'CMPLTPS' with operands swapped
            // in VC7.1.
            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));

            // Convert the 4-bit mask to 4 bytes, and store results.
            /*
            *reinterpret_cast<uint32*>(lightFacings) =
                *reinterpret_cast<const uint32*>(msMaskMapping[bitmask]);
            */
            memcpy(lightFacings, msMaskMapping[bitmask], sizeof(uint32));

            lightFacings += 4;
        }

        // Dealing with remaining faces
        switch (numFaces)
        {
        case 3:
            n0 = __MM_LOAD_PS(&faceNormals[0].x);
            n1 = __MM_LOAD_PS(&faceNormals[1].x);
            n2 = __MM_LOAD_PS(&faceNormals[2].x);

            n0 = _mm_mul_ps(n0, lp);                    // x0 y0 z0 w0
            n1 = _mm_mul_ps(n1, lp);                    // x1 y1 z1 w1
            n2 = _mm_mul_ps(n2, lp);                    // x2 y2 z2 w2

            t0 = _mm_add_ps(                            // x0+z0 x1+z1 y0+w0 y1+w1
                _mm_unpacklo_ps(n0, n1),                // x0 x1 y0 y1
                _mm_unpackhi_ps(n0, n1));               // z0 z1 w0 w1
            t1 = _mm_add_ps(                            // x2+z2 x2+z2 y2+w2 y2+w2
                _mm_unpacklo_ps(n2, n2),                // x2 x2 y2 y2
                _mm_unpackhi_ps(n2, n2));               // z2 z2 w2 w2
            dp = _mm_add_ps(                            // dp0 dp1 dp2 dp2
                _mm_movelh_ps(t0, t1),                  // x0+z0 x1+z1 x2+z2 x2+z2
                _mm_movehl_ps(t1, t0));                 // y0+w0 y1+w1 y2+w2 y2+w2

            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));

            lightFacings[0] = msMaskMapping[bitmask][0];
            lightFacings[1] = msMaskMapping[bitmask][1];
            lightFacings[2] = msMaskMapping[bitmask][2];
            break;

        case 2:
            n0 = __MM_LOAD_PS(&faceNormals[0].x);
            n1 = __MM_LOAD_PS(&faceNormals[1].x);

            n0 = _mm_mul_ps(n0, lp);                    // x0 y0 z0 w0
            n1 = _mm_mul_ps(n1, lp);                    // x1 y1 z1 w1

            t0 = _mm_add_ps(                            // x0+z0 x1+z1 y0+w0 y1+w1
                _mm_unpacklo_ps(n0, n1),                // x0 x1 y0 y1
                _mm_unpackhi_ps(n0, n1));               // z0 z1 w0 w1
            dp = _mm_add_ps(                            // dp0 dp1 dp0 dp1
                _mm_movelh_ps(t0, t0),                  // x0+z0 x1+z1 x0+z0 x1+z1
                _mm_movehl_ps(t0, t0));                 // y0+w0 y1+w1 y0+w0 y1+w1

            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));

            lightFacings[0] = msMaskMapping[bitmask][0];
            lightFacings[1] = msMaskMapping[bitmask][1];
            break;

        case 1:
            n0 = __MM_LOAD_PS(&faceNormals[0].x);

            n0 = _mm_mul_ps(n0, lp);                    // x0 y0 z0 w0

            t0 = _mm_add_ps(                            // x0+z0 x0+z0 y0+w0 y0+w0
                _mm_unpacklo_ps(n0, n0),                // x0 x0 y0 y0
                _mm_unpackhi_ps(n0, n0));               // z0 z0 w0 w0
            dp = _mm_add_ps(                            // dp0 dp0 dp0 dp0
                _mm_movelh_ps(t0, t0),                  // x0+z0 x0+z0 x0+z0 x0+z0
                _mm_movehl_ps(t0, t0));                 // y0+w0 y0+w0 y0+w0 y0+w0

            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));

            lightFacings[0] = msMaskMapping[bitmask][0];
            break;
        }
    }
    //---------------------------------------------------------------------
    // Template to extrude vertices for directional light.
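    //
    // For reference: with a directional light every vertex is pushed by the same fixed
    // offset. A rough scalar equivalent of the template below (illustrative only) is:
    //
    //     Vector3 dir = Vector3(lightPos.x, lightPos.y, lightPos.z).normalisedCopy() * extrudeDist;
    //     dest[i] = src[i] - dir;    // subtract: the direction is kept inverted to save a negation
    //
    // The SSE version pre-shuffles 'dir' into three registers (X Y Z X / Y Z X Y / Z X Y Z)
    // so the subtraction lines up with the packed 4x3 float layout of the vertex buffer.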
    template <bool srcAligned, bool destAligned>
    struct ExtrudeVertices_SSE_DirectionalLight
    {
        static void apply(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* pSrcPos,
            float* pDestPos,
            size_t numVertices)
        {
            typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
            typedef SSEMemoryAccessor<destAligned> DestAccessor;

            // Directional light, extrusion is along light direction

            // Load light vector, unaligned
            __m128 lp = _mm_loadu_ps(&lightPos.x);

            // Calculate extrusion direction. Note that we use the inverted direction here
            // to eliminate an extra negation instruction; we'll compensate for that by
            // using a subtract instruction instead later.
            __m128 tmp = _mm_mul_ps(lp, lp);
            tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)), _mm_movehl_ps(tmp, tmp));
            // Looks like VC7.1 generates a bit inefficient code for 'rsqrtss', so use 'rsqrtps' instead
            tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), _mm_load_ss(&extrudeDist));
            __m128 dir = _mm_mul_ps(lp, __MM_SELECT(tmp, 0));               // X Y Z -

            // Prepare extrude direction for extruding 4 vertices in parallel
            __m128 dir0 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(0,2,1,0));   // X Y Z X
            __m128 dir1 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(1,0,2,1));   // Y Z X Y
            __m128 dir2 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(2,1,0,2));   // Z X Y Z

            __m128 s0, s1, s2;
            __m128 d0, d1, d2;

            size_t numIterations = numVertices / 4;
            numVertices &= 3;

            // Extruding 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                s0 = SrcAccessor::load(pSrcPos + 0);
                s1 = SrcAccessor::load(pSrcPos + 4);
                s2 = SrcAccessor::load(pSrcPos + 8);
                pSrcPos += 12;

                // The extrusion direction is inverted, use subtract instruction here
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
                d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 X2 Y2
                d2 = _mm_sub_ps(s2, dir2);                      // Z2 X3 Y3 Z3

                DestAccessor::store(pDestPos + 0, d0);
                DestAccessor::store(pDestPos + 4, d1);
                DestAccessor::store(pDestPos + 8, d2);
                pDestPos += 12;
            }

            // Dealing with remaining vertices
            switch (numVertices)
            {
            case 3:
                // 9 floating-point values
                s0 = SrcAccessor::load(pSrcPos + 0);
                s1 = SrcAccessor::load(pSrcPos + 4);
                s2 = _mm_load_ss(pSrcPos + 8);

                // The extrusion direction is inverted, use subtract instruction here
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
                d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 X2 Y2
                d2 = _mm_sub_ss(s2, dir2);                      // Z2 -- -- --

                DestAccessor::store(pDestPos + 0, d0);
                DestAccessor::store(pDestPos + 4, d1);
                _mm_store_ss(pDestPos + 8, d2);
                break;

            case 2:
                // 6 floating-point values
                s0 = SrcAccessor::load(pSrcPos + 0);
                s1 = _mm_loadl_pi(dir1, (const __m64*)(pSrcPos + 4));   // dir1 is meaningless here

                // The extrusion direction is inverted, use subtract instruction here
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
                d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 -- --

                DestAccessor::store(pDestPos + 0, d0);
                _mm_storel_pi((__m64*)(pDestPos + 4), d1);
                break;

            case 1:
                // 3 floating-point values
                s0 = _mm_loadl_pi(dir0, (const __m64*)(pSrcPos + 0));   // dir0 is meaningless here
                s1 = _mm_load_ss(pSrcPos + 2);

                // The extrusion direction is inverted, use subtract instruction here
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 -- --
                d1 = _mm_sub_ss(s1, dir2);                      // Z0 -- -- --

                _mm_storel_pi((__m64*)(pDestPos + 0), d0);
                _mm_store_ss(pDestPos + 2, d1);
                break;
            }
        }
    };
    //---------------------------------------------------------------------
    // Template to extrude vertices for point light.
    template <bool srcAligned, bool destAligned>
    struct ExtrudeVertices_SSE_PointLight
    {
        static void apply(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* pSrcPos,
            float* pDestPos,
            size_t numVertices)
        {
            typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
            typedef SSEMemoryAccessor<destAligned> DestAccessor;

            // Point light, we calculate the extrusion direction for every vertex

            // Load light vector, unaligned
            __m128 lp = _mm_loadu_ps(&lightPos.x);

            // Load extrude distance
            __m128 extrudeDist4 = _mm_load_ps1(&extrudeDist);

            size_t numIterations = numVertices / 4;
            numVertices &= 3;

            // Extruding 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Load source positions
                __m128 s0 = SrcAccessor::load(pSrcPos + 0);     // x0 y0 z0 x1
                __m128 s1 = SrcAccessor::load(pSrcPos + 4);     // y1 z1 x2 y2
                __m128 s2 = SrcAccessor::load(pSrcPos + 8);     // z2 x3 y3 z3
                pSrcPos += 12;

                // Arrange to 3x4 component-major for batch calculation
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);

                // Calculate unnormalised extrusion direction
                __m128 dx = _mm_sub_ps(s0, __MM_SELECT(lp, 0)); // X0 X1 X2 X3
                __m128 dy = _mm_sub_ps(s1, __MM_SELECT(lp, 1)); // Y0 Y1 Y2 Y3
                __m128 dz = _mm_sub_ps(s2, __MM_SELECT(lp, 2)); // Z0 Z1 Z2 Z3

                // Normalise extrusion direction and multiply by extrude distance
                __m128 tmp = __MM_DOT3x3_PS(dx, dy, dz, dx, dy, dz);
                tmp = _mm_mul_ps(_mm_rsqrt_ps(tmp), extrudeDist4);
                dx = _mm_mul_ps(dx, tmp);
                dy = _mm_mul_ps(dy, tmp);
                dz = _mm_mul_ps(dz, tmp);

                // Calculate extruded positions
                __m128 d0 = _mm_add_ps(dx, s0);
                __m128 d1 = _mm_add_ps(dy, s1);
                __m128 d2 = _mm_add_ps(dz, s2);

                // Arrange back to 4x3 continuous format for storing results
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store extruded positions
                DestAccessor::store(pDestPos + 0, d0);
                DestAccessor::store(pDestPos + 4, d1);
                DestAccessor::store(pDestPos + 8, d2);
                pDestPos += 12;
            }

            // Dealing with remaining vertices
            for (size_t j = 0; j < numVertices; ++j)
            {
                // Load source position
                __m128 src = _mm_loadh_pi(_mm_load_ss(pSrcPos + 0), (const __m64*)(pSrcPos + 1)); // x 0 y z
                pSrcPos += 3;

                // Calculate unnormalised extrusion direction
                __m128 dir = _mm_sub_ps(src, _mm_shuffle_ps(lp, lp, _MM_SHUFFLE(2,1,3,0))); // X 1 Y Z

                // Normalise extrusion direction and multiply by extrude distance
                __m128 tmp = _mm_mul_ps(dir, dir);
                tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_movehl_ps(tmp, tmp)), _mm_shuffle_ps(tmp, tmp, 3));
                // Looks like VC7.1 generates a bit inefficient code for 'rsqrtss', so use 'rsqrtps' instead
                tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), extrudeDist4);
                dir = _mm_mul_ps(dir, __MM_SELECT(tmp, 0));

                // Calculate extruded position
                __m128 dst = _mm_add_ps(dir, src);

                // Store extruded position
                _mm_store_ss(pDestPos + 0, dst);
                _mm_storeh_pi((__m64*)(pDestPos + 1), dst);
                pDestPos += 3;
            }
        }
    };
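    //
    // For reference: with a point light the extrusion direction differs per vertex. A rough
    // scalar equivalent of the point-light template above (illustrative only) is:
    //
    //     Vector3 dir = src[i] - Vector3(lightPos.x, lightPos.y, lightPos.z);
    //     dest[i] = src[i] + dir.normalisedCopy() * extrudeDist;
    //
    // The SSE path approximates the normalisation with _mm_rsqrt_ps, trading a little
    // precision for speed, which is acceptable for shadow volume extrusion.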
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::extrudeVertices(
        const Vector4& lightPos,
        Real extrudeDist,
        const float* pSrcPos,
        float* pDestPos,
        size_t numVertices)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        // Note: Since pDestPos follows the tail of pSrcPos, we can't assume it's
        // properly aligned to the SIMD alignment, so we must check for it here.
        //
        // TODO: Add an extra vertex to the vertex buffer to make sure pDestPos
        // is aligned the same as pSrcPos.
        //

        // We use the SSE reciprocal square root directly while calculating the
        // extrusion direction, since the precision loss is not that important here.
        //
        if (lightPos.w == 0.0f)
        {
            if (_isAlignedForSSE(pSrcPos))
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_DirectionalLight<true, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_DirectionalLight<true, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
            else
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_DirectionalLight<false, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_DirectionalLight<false, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
        }
        else
        {
            assert(lightPos.w == 1.0f);

            if (_isAlignedForSSE(pSrcPos))
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_PointLight<true, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_PointLight<true, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
            else
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_PointLight<false, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_PointLight<false, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
        }
    }
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    extern OptimisedUtil* _getOptimisedUtilSSE(void)
    {
        static OptimisedUtilSSE msOptimisedUtilSSE;
#if defined(__OGRE_SIMD_ALIGN_STACK)
        static OptimisedUtilWithStackAlign msOptimisedUtilWithStackAlign(&msOptimisedUtilSSE);
        return &msOptimisedUtilWithStackAlign;
#else
        return &msOptimisedUtilSSE;
#endif
    }

}

#endif // __OGRE_HAVE_SSE
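
// For reference: this translation unit only provides the SSE-specialised singleton.
// Callers are expected to go through the dispatcher in OgreOptimisedUtil.cpp, which
// (assuming the usual OptimisedUtil::getImplementation() accessor declared in
// OgreOptimisedUtil.h) is used roughly like:
//
//     OptimisedUtil* util = OptimisedUtil::getImplementation();
//     util->extrudeVertices(lightPos, extrudeDist, srcPositions, destPositions, numVertices);
//
// so user code never instantiates OptimisedUtilSSE directly.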