/*
-----------------------------------------------------------------------------
This source file is part of OGRE
    (Object-oriented Graphics Rendering Engine)
For the latest info, see http://www.ogre3d.org/

Copyright (c) 2000-2014 Torus Knot Software Ltd

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
-----------------------------------------------------------------------------
*/
#include "OgreStableHeaders.h"
#include "OgreOptimisedUtil.h"


#if __OGRE_HAVE_SSE || __OGRE_HAVE_NEON

// Keep this include last, to avoid problems with "xmmintrin.h" being included
// by other header files on some platforms.
#include "OgreSIMDHelper.h"

// I'd like to merge this file with OgreOptimisedUtil.cpp, but that's
// impossible when compiling with gcc, because SSE instructions can only be
// enabled/disabled at file level.

//-------------------------------------------------------------------------
//
// The routines implemented in this file are performance oriented, which
// means squeezing out every cycle possible. This requirement might break
// some C++/STL rules.
//
//
// Some rules I'd like to respect:
//
// 1. Prefer unpacklo/hi and movelh/hl to shuffle, because they can save one
//    byte of binary code :)
// 2. Use add/sub instead of mul.
// 3. Eliminate function-call prologue code.
//
// And finally, anything recommended by the Intel Optimization Reference Manual.
//
//-------------------------------------------------------------------------

// Use the unrolled SSE version when the vertex count exceeds this limit
#define OGRE_SSE_SKINNING_UNROLL_VERTICES  16

namespace Ogre {

//-------------------------------------------------------------------------
// Local classes
//-------------------------------------------------------------------------

    /** SSE implementation of OptimisedUtil.
    @note
        Don't use this class directly, use OptimisedUtil instead.
    */
    class _OgrePrivate OptimisedUtilSSE : public OptimisedUtil
    {
    protected:
        /// Do we prefer to use a general SSE version for position/normal shared buffers?
        bool mPreferGeneralVersionForSharedBuffers;

    public:
        /// Constructor
        OptimisedUtilSSE(void);

        /// @copydoc OptimisedUtil::softwareVertexSkinning
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexSkinning(
            const float *srcPosPtr, float *destPosPtr,
            const float *srcNormPtr, float *destNormPtr,
            const float *blendWeightPtr, const unsigned char* blendIndexPtr,
            const Affine3* const* blendMatrices,
            size_t srcPosStride, size_t destPosStride,
            size_t srcNormStride, size_t destNormStride,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numVertices);

        /// @copydoc OptimisedUtil::softwareVertexMorph
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE softwareVertexMorph(
            Real t,
            const float *srcPos1, const float *srcPos2,
            float *dstPos,
            size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
            size_t numVertices,
            bool morphNormals);

        /// @copydoc OptimisedUtil::concatenateAffineMatrices
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE concatenateAffineMatrices(
            const Affine3& baseMatrix,
            const Affine3* srcMatrices,
            Affine3* dstMatrices,
            size_t numMatrices);

        /// @copydoc OptimisedUtil::calculateFaceNormals
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateFaceNormals(
            const float *positions,
            const EdgeData::Triangle *triangles,
            Vector4 *faceNormals,
            size_t numTriangles);

        /// @copydoc OptimisedUtil::calculateLightFacing
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE calculateLightFacing(
            const Vector4& lightPos,
            const Vector4* faceNormals,
            char* lightFacings,
            size_t numFaces);

        /// @copydoc OptimisedUtil::extrudeVertices
        virtual void __OGRE_SIMD_ALIGN_ATTRIBUTE extrudeVertices(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* srcPositions,
            float* destPositions,
            size_t numVertices);
    };

#if defined(__OGRE_SIMD_ALIGN_STACK)
    /** Stack-aligning implementation of OptimisedUtil.
    @remarks
        User code compiled by icc and gcc might not align the stack properly,
        so we need to ensure the stack is aligned to a 16-byte boundary when
        executing SSE functions.
    @par
        We implement this by aligning the stack and then forwarding through a
        virtual function call, which guarantees that a call instruction is used
        instead of the underlying function body being inlined here (which might
        cause problems).
    @note
        Don't use this class directly, use OptimisedUtil instead.
    */
    class _OgrePrivate OptimisedUtilWithStackAlign : public OptimisedUtil
    {
    protected:
        /// The actual implementation
        OptimisedUtil* mImpl;

    public:
        /// Constructor
        OptimisedUtilWithStackAlign(OptimisedUtil* impl)
            : mImpl(impl)
        {
        }

        /// @copydoc OptimisedUtil::softwareVertexSkinning
        virtual void softwareVertexSkinning(
            const float *srcPosPtr, float *destPosPtr,
            const float *srcNormPtr, float *destNormPtr,
            const float *blendWeightPtr, const unsigned char* blendIndexPtr,
            const Affine3* const* blendMatrices,
            size_t srcPosStride, size_t destPosStride,
            size_t srcNormStride, size_t destNormStride,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numVertices)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->softwareVertexSkinning(
                srcPosPtr, destPosPtr,
                srcNormPtr, destNormPtr,
                blendWeightPtr, blendIndexPtr,
                blendMatrices,
                srcPosStride, destPosStride,
                srcNormStride, destNormStride,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numVertices);
        }

        /// @copydoc OptimisedUtil::softwareVertexMorph
        virtual void softwareVertexMorph(
            Real t,
            const float *srcPos1, const float *srcPos2,
            float *dstPos,
            size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
            size_t numVertices,
            bool morphNormals)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->softwareVertexMorph(
                t,
                srcPos1, srcPos2,
                dstPos,
                pos1VSize, pos2VSize, dstVSize,
                numVertices,
                morphNormals);
        }

        /// @copydoc OptimisedUtil::concatenateAffineMatrices
        virtual void concatenateAffineMatrices(
            const Affine3& baseMatrix,
            const Affine3* srcMatrices,
            Affine3* dstMatrices,
            size_t numMatrices)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->concatenateAffineMatrices(
                baseMatrix,
                srcMatrices,
                dstMatrices,
                numMatrices);
        }

        /// @copydoc OptimisedUtil::calculateFaceNormals
        virtual void calculateFaceNormals(
            const float *positions,
            const EdgeData::Triangle *triangles,
            Vector4 *faceNormals,
            size_t numTriangles)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->calculateFaceNormals(
                positions,
                triangles,
                faceNormals,
                numTriangles);
        }

        /// @copydoc OptimisedUtil::calculateLightFacing
        virtual void calculateLightFacing(
            const Vector4& lightPos,
            const Vector4* faceNormals,
            char* lightFacings,
            size_t numFaces)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->calculateLightFacing(
                lightPos,
                faceNormals,
                lightFacings,
                numFaces);
        }

        /// @copydoc OptimisedUtil::extrudeVertices
        virtual void extrudeVertices(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* srcPositions,
            float* destPositions,
            size_t numVertices)
        {
            __OGRE_SIMD_ALIGN_STACK();

            mImpl->extrudeVertices(
                lightPos,
                extrudeDist,
                srcPositions,
                destPositions,
                numVertices);
        }
    };
#endif  // defined(__OGRE_SIMD_ALIGN_STACK)

    //---------------------------------------------------------------------
    // Some useful macros for collapsing matrices.
    //---------------------------------------------------------------------

#define __LOAD_MATRIX(row0, row1, row2, pMatrix) \
    { \
        row0 = __MM_LOAD_PS((*pMatrix)[0]); \
        row1 = __MM_LOAD_PS((*pMatrix)[1]); \
        row2 = __MM_LOAD_PS((*pMatrix)[2]); \
    }

#define __LERP_MATRIX(row0, row1, row2, weight, pMatrix) \
    { \
        row0 = __MM_LERP_PS(weight, row0, __MM_LOAD_PS((*pMatrix)[0])); \
        row1 = __MM_LERP_PS(weight, row1, __MM_LOAD_PS((*pMatrix)[1])); \
        row2 = __MM_LERP_PS(weight, row2, __MM_LOAD_PS((*pMatrix)[2])); \
    }

#define __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix) \
    { \
        row0 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[0]), weight); \
        row1 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[1]), weight); \
        row2 = _mm_mul_ps(__MM_LOAD_PS((*pMatrix)[2]), weight); \
    }

#define __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix) \
    { \
        row0 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[0]), weight, row0); \
        row1 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[1]), weight, row1); \
        row2 = __MM_MADD_PS(__MM_LOAD_PS((*pMatrix)[2]), weight, row2); \
    }

    //---------------------------------------------------------------------
    // The following macros require variables to be declared by the caller.
    //
    // :) Thanks to the row-major matrices used in Ogre, accessing an affine
    // matrix here is easy.
    //---------------------------------------------------------------------

    /** Collapse a one-weighted matrix.
        The multiply by the weight is eliminated, since the weight is always
        equal to one.
    */
#define __COLLAPSE_MATRIX_W1(row0, row1, row2, ppMatrices, pIndices, pWeights) \
    { \
        pMatrix0 = blendMatrices[pIndices[0]]; \
        __LOAD_MATRIX(row0, row1, row2, pMatrix0); \
    }

    /** Collapse a two-weighted matrix.
        Based on the fact that the accumulated weights are equal to one, using
        a lerp replaces two multiplies and one addition with one multiply and
        two additions.
    */
#define __COLLAPSE_MATRIX_W2(row0, row1, row2, ppMatrices, pIndices, pWeights) \
    { \
        weight = _mm_load_ps1(pWeights + 1); \
        pMatrix0 = ppMatrices[pIndices[0]]; \
        __LOAD_MATRIX(row0, row1, row2, pMatrix0); \
        pMatrix1 = ppMatrices[pIndices[1]]; \
        __LERP_MATRIX(row0, row1, row2, weight, pMatrix1); \
    }
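
    // Why the two-weight case gets away with a single lerp per row: because the
    // blend weights sum to one (w0 + w1 == 1), each collapsed row satisfies
    //
    //     w0*r0 + w1*r1 = (1 - w1)*r0 + w1*r1 = r0 + w1*(r1 - r0)
    //
    // so only pWeights[1] needs to be loaded. This assumes __MM_LERP_PS(t, a, b)
    // evaluates to a + t*(b - a), as its name suggests.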

    /** Collapse a three-weighted matrix.
    */
#define __COLLAPSE_MATRIX_W3(row0, row1, row2, ppMatrices, pIndices, pWeights) \
    { \
        weight = _mm_load_ps1(pWeights + 0); \
        pMatrix0 = ppMatrices[pIndices[0]]; \
        __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0); \
        weight = _mm_load_ps1(pWeights + 1); \
        pMatrix1 = ppMatrices[pIndices[1]]; \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1); \
        weight = _mm_load_ps1(pWeights + 2); \
        pMatrix2 = ppMatrices[pIndices[2]]; \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2); \
    }

    /** Collapse a four-weighted matrix.
    */
#define __COLLAPSE_MATRIX_W4(row0, row1, row2, ppMatrices, pIndices, pWeights) \
    { \
        /* Load four blend weights at one time, they will be shuffled later */ \
        weights = _mm_loadu_ps(pWeights); \
        \
        pMatrix0 = ppMatrices[pIndices[0]]; \
        weight = __MM_SELECT(weights, 0); \
        __LOAD_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix0); \
        pMatrix1 = ppMatrices[pIndices[1]]; \
        weight = __MM_SELECT(weights, 1); \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix1); \
        pMatrix2 = ppMatrices[pIndices[2]]; \
        weight = __MM_SELECT(weights, 2); \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix2); \
        pMatrix3 = ppMatrices[pIndices[3]]; \
        weight = __MM_SELECT(weights, 3); \
        __ACCUM_WEIGHTED_MATRIX(row0, row1, row2, weight, pMatrix3); \
    }



    //---------------------------------------------------------------------
    // Collapse one matrix at a time. The collapsed matrix is weighted by the
    // blend weights, and can then be used to transform the corresponding
    // vertex directly.
    //
    // I'd like to use an inline function instead of a macro here, but I also
    // want to ensure the compiler integrates this code into its callers (in a
    // release build at least), regardless of the specific compile options.
    // An inline function works fine for VC, but gcc (3.4.4 here) seems to
    // generate a function call when this is implemented as an inline function,
    // even when compiling with the "-O3" option.
    //
#define _collapseOneMatrix( \
        m00, m01, m02, \
        pBlendWeight, pBlendIndex, \
        blendMatrices, \
        blendWeightStride, blendIndexStride, \
        numWeightsPerVertex) \
    { \
        /* Important Note: If pMatrixXXX is reused frequently, M$ VC7.1 will */ \
        /* generate wrong code here!!! */ \
        const Affine3* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3; \
        __m128 weight, weights; \
        \
        switch (numWeightsPerVertex) \
        { \
        default:    /* Just in case, and to make the compiler happy */ \
        case 1: \
            __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            break; \
        \
        case 2: \
            __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            break; \
        \
        case 3: \
            __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            break; \
        \
        case 4: \
            __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            break; \
        } \
    }

    //---------------------------------------------------------------------
    // Collapse four matrices at a time. The collapsed matrices are weighted by
    // the blend weights, and can then be used to transform the corresponding
    // vertices directly.
    //
    // I'd like to use an inline function instead of a macro here, but I also
    // want to ensure the compiler integrates this code into its callers (in a
    // release build at least), regardless of the specific compile options.
    // An inline function works fine for VC, but gcc (3.4.4 here) seems to
    // generate a function call when this is implemented as an inline function,
    // even when compiling with the "-O3" option.
    //
#define _collapseFourMatrices( \
        m00, m01, m02, \
        m10, m11, m12, \
        m20, m21, m22, \
        m30, m31, m32, \
        pBlendWeight, pBlendIndex, \
        blendMatrices, \
        blendWeightStride, blendIndexStride, \
        numWeightsPerVertex) \
    { \
        /* Important Note: If pMatrixXXX is reused frequently, M$ VC7.1 will */ \
        /* generate wrong code here!!! */ \
        const Affine3* pMatrix0, *pMatrix1, *pMatrix2, *pMatrix3; \
        __m128 weight, weights; \
        \
        switch (numWeightsPerVertex) \
        { \
        default:    /* Just in case, and to make the compiler happy */ \
        case 1: \
            __COLLAPSE_MATRIX_W1(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W1(m10, m11, m12, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W1(m20, m21, m22, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W1(m30, m31, m32, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
            break; \
        \
        case 2: \
            __COLLAPSE_MATRIX_W2(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W2(m10, m11, m12, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W2(m20, m21, m22, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W2(m30, m31, m32, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
            break; \
        \
        case 3: \
            __COLLAPSE_MATRIX_W3(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W3(m10, m11, m12, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W3(m20, m21, m22, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W3(m30, m31, m32, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
            break; \
        \
        case 4: \
            __COLLAPSE_MATRIX_W4(m00, m01, m02, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 0 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 0 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W4(m10, m11, m12, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 1 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 1 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W4(m20, m21, m22, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 2 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 2 * blendWeightStride)); \
            __COLLAPSE_MATRIX_W4(m30, m31, m32, blendMatrices, \
                rawOffsetPointer(pBlendIndex, 3 * blendIndexStride), \
                rawOffsetPointer(pBlendWeight, 3 * blendWeightStride)); \
            break; \
        } \
    }

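    //---------------------------------------------------------------------
    // Note on matrix collapsing: for a vertex with blend weights w[i] and bone
    // matrices M[i], the blended position is
    //
    //     p' = sum_i w[i] * (M[i] * p) = (sum_i w[i] * M[i]) * p
    //
    // so the macros above build the weighted sum of matrices once per vertex
    // (three rows per matrix, since the matrices are affine), and the vertex is
    // then transformed by a single collapsed matrix.
    //---------------------------------------------------------------------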

    //---------------------------------------------------------------------
    // General SSE version: skins positions, and optionally skins normals.
    static void softwareVertexSkinning_SSE_General(
        const float *pSrcPos, float *pDestPos,
        const float *pSrcNorm, float *pDestNorm,
        const float *pBlendWeight, const unsigned char* pBlendIndex,
        const Affine3* const* blendMatrices,
        size_t srcPosStride, size_t destPosStride,
        size_t srcNormStride, size_t destNormStride,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numVertices)
    {
        for (size_t i = 0; i < numVertices; ++i)
        {
            // Collapse matrices
            __m128 m00, m01, m02;
            _collapseOneMatrix(
                m00, m01, m02,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex);

            // Advance blend weight and index pointers
            advanceRawPointer(pBlendWeight, blendWeightStride);
            advanceRawPointer(pBlendIndex, blendIndexStride);

            //------------------------------------------------------------------

            // Rearrange to a column-major matrix with the rows shuffled to the order: Z 0 X Y
            __m128 m03 = _mm_setzero_ps();
            __MM_TRANSPOSE4x4_PS(m02, m03, m00, m01);

            //------------------------------------------------------------------
            // Transform position
            //------------------------------------------------------------------

            __m128 s0, s1, s2;

            // Load source position
            s0 = _mm_load_ps1(pSrcPos + 0);
            s1 = _mm_load_ps1(pSrcPos + 1);
            s2 = _mm_load_ps1(pSrcPos + 2);

            // Transform by collapsed matrix
            __m128 accumPos = __MM_DOT4x3_PS(m02, m03, m00, m01, s0, s1, s2);   // z 0 x y

            // Store blended position, no alignment requirement
            _mm_storeh_pi((__m64*)pDestPos, accumPos);
            _mm_store_ss(pDestPos+2, accumPos);

            // Advance source and target position pointers
            advanceRawPointer(pSrcPos, srcPosStride);
            advanceRawPointer(pDestPos, destPosStride);

            //------------------------------------------------------------------
            // Optionally blend normal
            //------------------------------------------------------------------

            if (pSrcNorm)
            {
                // Load source normal
                s0 = _mm_load_ps1(pSrcNorm + 0);
                s1 = _mm_load_ps1(pSrcNorm + 1);
                s2 = _mm_load_ps1(pSrcNorm + 2);

                // Transform by collapsed matrix
                __m128 accumNorm = __MM_DOT3x3_PS(m02, m03, m00, s0, s1, s2);   // z 0 x y

                // Normalise normal
                __m128 tmp = _mm_mul_ps(accumNorm, accumNorm);                  // z^2 0 x^2 y^2
                tmp = __MM_ACCUM3_PS(tmp,
                    _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,1,2)),             // x^2 0 y^2 z^2
                    _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,0,1,3)));            // y^2 0 z^2 x^2
                // Note: a divide by zero happens here, but it's negligible
                tmp = __MM_RSQRT_PS(tmp);
                accumNorm = _mm_mul_ps(accumNorm, tmp);

                // Store blended normal, no alignment requirement
                _mm_storeh_pi((__m64*)pDestNorm, accumNorm);
                _mm_store_ss(pDestNorm+2, accumNorm);

                // Advance source and target normal pointers
                advanceRawPointer(pSrcNorm, srcNormStride);
                advanceRawPointer(pDestNorm, destNormStride);
            }
        }
    }
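    //---------------------------------------------------------------------
    // Note on the "z 0 x y" register layout used above: after the 4x4 transpose,
    // the blended vector keeps x and y in the two high lanes and z in the lowest
    // lane, so the result can be written out with one _mm_storeh_pi (x, y) plus
    // one _mm_store_ss (z), without extra shuffles and without any alignment
    // requirement on the destination pointer.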
    //---------------------------------------------------------------------
    // Special SSE version for skinning shared buffers of position and normal,
    // where the buffer is packed.
    template <bool srcAligned, bool destAligned>
    struct SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed
    {
        static void apply(
            const float* pSrc, float* pDest,
            const float* pBlendWeight, const unsigned char* pBlendIndex,
            const Affine3* const* blendMatrices,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numIterations)
        {
            typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
            typedef SSEMemoryAccessor<destAligned> DestAccessor;

            // Blend 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Collapse matrices
                __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
                _collapseFourMatrices(
                    m00, m01, m02,
                    m10, m11, m12,
                    m20, m21, m22,
                    m30, m31, m32,
                    pBlendWeight, pBlendIndex,
                    blendMatrices,
                    blendWeightStride, blendIndexStride,
                    numWeightsPerVertex);

                // Advance 4 vertices
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);

                //------------------------------------------------------------------
                // Transform positions/normals
                //------------------------------------------------------------------

                __m128 s0, s1, s2, s3, s4, s5, d0, d1, d2, d3, d4, d5;
                __m128 t0, t1, t2, t3, t4, t5;

                // Load source positions/normals
                s0 = SrcAccessor::load(pSrc + 0);                       // px0 py0 pz0 nx0
                s1 = SrcAccessor::load(pSrc + 4);                       // ny0 nz0 px1 py1
                s2 = SrcAccessor::load(pSrc + 8);                       // pz1 nx1 ny1 nz1
                s3 = SrcAccessor::load(pSrc + 12);                      // px2 py2 pz2 nx2
                s4 = SrcAccessor::load(pSrc + 16);                      // ny2 nz2 px3 py3
                s5 = SrcAccessor::load(pSrc + 20);                      // pz3 nx3 ny3 nz3

                // Rearrange to component-major for batch calculations.
                t0 = _mm_unpacklo_ps(s0, s3);                           // px0 px2 py0 py2
                t1 = _mm_unpackhi_ps(s0, s3);                           // pz0 pz2 nx0 nx2
                t2 = _mm_unpacklo_ps(s1, s4);                           // ny0 ny2 nz0 nz2
                t3 = _mm_unpackhi_ps(s1, s4);                           // px1 px3 py1 py3
                t4 = _mm_unpacklo_ps(s2, s5);                           // pz1 pz3 nx1 nx3
                t5 = _mm_unpackhi_ps(s2, s5);                           // ny1 ny3 nz1 nz3

                s0 = _mm_unpacklo_ps(t0, t3);                           // px0 px1 px2 px3
                s1 = _mm_unpackhi_ps(t0, t3);                           // py0 py1 py2 py3
                s2 = _mm_unpacklo_ps(t1, t4);                           // pz0 pz1 pz2 pz3
                s3 = _mm_unpackhi_ps(t1, t4);                           // nx0 nx1 nx2 nx3
                s4 = _mm_unpacklo_ps(t2, t5);                           // ny0 ny1 ny2 ny3
                s5 = _mm_unpackhi_ps(t2, t5);                           // nz0 nz1 nz2 nz3

                // Transform by collapsed matrix

                // Shuffle row 0 of the four collapsed matrices for calculating the X component
                __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);

                // Transform X components
                d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // PX0 PX1 PX2 PX3
                d3 = __MM_DOT3x3_PS(m00, m10, m20, s3, s4, s5);         // NX0 NX1 NX2 NX3

                // Shuffle row 1 of the four collapsed matrices for calculating the Y component
                __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);

                // Transform Y components
                d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // PY0 PY1 PY2 PY3
                d4 = __MM_DOT3x3_PS(m01, m11, m21, s3, s4, s5);         // NY0 NY1 NY2 NY3

                // Shuffle row 2 of the four collapsed matrices for calculating the Z component
                __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);

                // Transform Z components
                d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // PZ0 PZ1 PZ2 PZ3
                d5 = __MM_DOT3x3_PS(m02, m12, m22, s3, s4, s5);         // NZ0 NZ1 NZ2 NZ3

                // Normalise normals
                __m128 tmp = __MM_DOT3x3_PS(d3, d4, d5, d3, d4, d5);
                tmp = __MM_RSQRT_PS(tmp);
                d3 = _mm_mul_ps(d3, tmp);
                d4 = _mm_mul_ps(d4, tmp);
                d5 = _mm_mul_ps(d5, tmp);

                // Arrange back to the continuous (interleaved) format for storing the results

                t0 = _mm_unpacklo_ps(d0, d1);                           // PX0 PY0 PX1 PY1
                t1 = _mm_unpackhi_ps(d0, d1);                           // PX2 PY2 PX3 PY3
                t2 = _mm_unpacklo_ps(d2, d3);                           // PZ0 NX0 PZ1 NX1
                t3 = _mm_unpackhi_ps(d2, d3);                           // PZ2 NX2 PZ3 NX3
                t4 = _mm_unpacklo_ps(d4, d5);                           // NY0 NZ0 NY1 NZ1
                t5 = _mm_unpackhi_ps(d4, d5);                           // NY2 NZ2 NY3 NZ3

                d0 = _mm_movelh_ps(t0, t2);                             // PX0 PY0 PZ0 NX0
                d1 = _mm_shuffle_ps(t4, t0, _MM_SHUFFLE(3,2,1,0));      // NY0 NZ0 PX1 PY1
                d2 = _mm_movehl_ps(t4, t2);                             // PZ1 NX1 NY1 NZ1
                d3 = _mm_movelh_ps(t1, t3);                             // PX2 PY2 PZ2 NX2
                d4 = _mm_shuffle_ps(t5, t1, _MM_SHUFFLE(3,2,1,0));      // NY2 NZ2 PX3 PY3
                d5 = _mm_movehl_ps(t5, t3);                             // PZ3 NX3 NY3 NZ3

                // Store blended positions/normals
                DestAccessor::store(pDest + 0, d0);
                DestAccessor::store(pDest + 4, d1);
                DestAccessor::store(pDest + 8, d2);
                DestAccessor::store(pDest + 12, d3);
                DestAccessor::store(pDest + 16, d4);
                DestAccessor::store(pDest + 20, d5);

                // Advance 4 vertices
                pSrc += 4 * (3 + 3);
                pDest += 4 * (3 + 3);
            }
        }
    };
    static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
        const float* pSrcPos, float* pDestPos,
        const float* pBlendWeight, const unsigned char* pBlendIndex,
        const Affine3* const* blendMatrices,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numIterations)
    {
        // pSrcPos might not be 16-byte alignable, because of the 8-byte alignment shift per vertex

        // Instantiating two versions only, since the other alignment combinations are not that important.
        if (_isAlignedForSSE(pSrcPos) && _isAlignedForSSE(pDestPos))
        {
            SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<true, true>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
        else
        {
            SoftwareVertexSkinning_SSE_PosNorm_Shared_Packed<false, false>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
    }
    //---------------------------------------------------------------------
    // Special SSE version for skinning separate buffers of position and normal,
    // where both the position and normal buffers are packed.
    template <bool srcPosAligned, bool destPosAligned, bool srcNormAligned, bool destNormAligned>
    struct SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed
    {
        static void apply(
            const float* pSrcPos, float* pDestPos,
            const float* pSrcNorm, float* pDestNorm,
            const float* pBlendWeight, const unsigned char* pBlendIndex,
            const Affine3* const* blendMatrices,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numIterations)
        {
            typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
            typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;
            typedef SSEMemoryAccessor<srcNormAligned> SrcNormAccessor;
            typedef SSEMemoryAccessor<destNormAligned> DestNormAccessor;

            // Blend 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Collapse matrices
                __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
                _collapseFourMatrices(
                    m00, m01, m02,
                    m10, m11, m12,
                    m20, m21, m22,
                    m30, m31, m32,
                    pBlendWeight, pBlendIndex,
                    blendMatrices,
                    blendWeightStride, blendIndexStride,
                    numWeightsPerVertex);

                // Advance 4 vertices
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);

                //------------------------------------------------------------------
                // Transform positions
                //------------------------------------------------------------------

                __m128 s0, s1, s2, d0, d1, d2;

                // Load source positions
                s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
                s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
                s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3

                // Arrange to 3x4 component-major for batch calculations
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);

                // Transform by collapsed matrix

                // Shuffle row 0 of the four collapsed matrices for calculating the X component
                __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);

                // Transform X components
                d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3

                // Shuffle row 1 of the four collapsed matrices for calculating the Y component
                __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);

                // Transform Y components
                d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3

                // Shuffle row 2 of the four collapsed matrices for calculating the Z component
                __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);

                // Transform Z components
                d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3

                // Arrange back to 4x3 continuous format for storing the results
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store blended positions
                DestPosAccessor::store(pDestPos + 0, d0);
                DestPosAccessor::store(pDestPos + 4, d1);
                DestPosAccessor::store(pDestPos + 8, d2);

                // Advance 4 vertices
                pSrcPos += 4 * 3;
                pDestPos += 4 * 3;

                //------------------------------------------------------------------
                // Transform normals
                //------------------------------------------------------------------

                // Load source normals
                s0 = SrcNormAccessor::load(pSrcNorm + 0);               // x0 y0 z0 x1
                s1 = SrcNormAccessor::load(pSrcNorm + 4);               // y1 z1 x2 y2
                s2 = SrcNormAccessor::load(pSrcNorm + 8);               // z2 x3 y3 z3

                // Arrange to 3x4 component-major for batch calculations
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);

                // Transform by the collapsed and shuffled matrices
                d0 = __MM_DOT3x3_PS(m00, m10, m20, s0, s1, s2);         // X0 X1 X2 X3
                d1 = __MM_DOT3x3_PS(m01, m11, m21, s0, s1, s2);         // Y0 Y1 Y2 Y3
                d2 = __MM_DOT3x3_PS(m02, m12, m22, s0, s1, s2);         // Z0 Z1 Z2 Z3

                // Normalise normals
                __m128 tmp = __MM_DOT3x3_PS(d0, d1, d2, d0, d1, d2);
                tmp = __MM_RSQRT_PS(tmp);
                d0 = _mm_mul_ps(d0, tmp);
                d1 = _mm_mul_ps(d1, tmp);
                d2 = _mm_mul_ps(d2, tmp);

                // Arrange back to 4x3 continuous format for storing the results
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store blended normals
                DestNormAccessor::store(pDestNorm + 0, d0);
                DestNormAccessor::store(pDestNorm + 4, d1);
                DestNormAccessor::store(pDestNorm + 8, d2);

                // Advance 4 vertices
                pSrcNorm += 4 * 3;
                pDestNorm += 4 * 3;
            }
        }
    };
    static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
        const float* pSrcPos, float* pDestPos,
        const float* pSrcNorm, float* pDestNorm,
        const float* pBlendWeight, const unsigned char* pBlendIndex,
        const Affine3* const* blendMatrices,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numIterations)
    {
        assert(_isAlignedForSSE(pSrcPos));

        // Instantiating two versions only, since the other alignment combinations are not that important.
        if (_isAlignedForSSE(pSrcNorm) && _isAlignedForSSE(pDestPos) && _isAlignedForSSE(pDestNorm))
        {
            SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, true, true, true>::apply(
                pSrcPos, pDestPos,
                pSrcNorm, pDestNorm,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
        else
        {
            SoftwareVertexSkinning_SSE_PosNorm_Separated_Packed<true, false, false, false>::apply(
                pSrcPos, pDestPos,
                pSrcNorm, pDestNorm,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
    }
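    //---------------------------------------------------------------------
    // A note on the SSEMemoryAccessor<aligned> typedefs used by the structs
    // above: the boolean template parameters only select, at compile time,
    // whether the load/store helpers may use the aligned SSE forms or must fall
    // back to the unaligned ones (the helpers come from OgreSIMDHelper.h,
    // included at the top of this file). Each dispatcher instantiates just two
    // combinations, since the other alignment mixes are not that important in
    // practice.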
    //---------------------------------------------------------------------
    // Special SSE version for skinning positions only, where the position
    // buffer is packed.
    template <bool srcPosAligned, bool destPosAligned>
    struct SoftwareVertexSkinning_SSE_PosOnly_Packed
    {
        static void apply(
            const float* pSrcPos, float* pDestPos,
            const float* pBlendWeight, const unsigned char* pBlendIndex,
            const Affine3* const* blendMatrices,
            size_t blendWeightStride, size_t blendIndexStride,
            size_t numWeightsPerVertex,
            size_t numIterations)
        {
            typedef SSEMemoryAccessor<srcPosAligned> SrcPosAccessor;
            typedef SSEMemoryAccessor<destPosAligned> DestPosAccessor;

            // Blend 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Collapse matrices
                __m128 m00, m01, m02, m10, m11, m12, m20, m21, m22, m30, m31, m32;
                _collapseFourMatrices(
                    m00, m01, m02,
                    m10, m11, m12,
                    m20, m21, m22,
                    m30, m31, m32,
                    pBlendWeight, pBlendIndex,
                    blendMatrices,
                    blendWeightStride, blendIndexStride,
                    numWeightsPerVertex);

                // Advance 4 vertices
                advanceRawPointer(pBlendWeight, 4 * blendWeightStride);
                advanceRawPointer(pBlendIndex, 4 * blendIndexStride);

                //------------------------------------------------------------------
                // Transform positions
                //------------------------------------------------------------------

                __m128 s0, s1, s2, d0, d1, d2;

                // Load source positions
                s0 = SrcPosAccessor::load(pSrcPos + 0);                 // x0 y0 z0 x1
                s1 = SrcPosAccessor::load(pSrcPos + 4);                 // y1 z1 x2 y2
                s2 = SrcPosAccessor::load(pSrcPos + 8);                 // z2 x3 y3 z3

                // Arrange to 3x4 component-major for batch calculations
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);

                // Transform by collapsed matrix

                // Shuffle row 0 of the four collapsed matrices for calculating the X component
                __MM_TRANSPOSE4x4_PS(m00, m10, m20, m30);

                // Transform X components
                d0 = __MM_DOT4x3_PS(m00, m10, m20, m30, s0, s1, s2);    // X0 X1 X2 X3

                // Shuffle row 1 of the four collapsed matrices for calculating the Y component
                __MM_TRANSPOSE4x4_PS(m01, m11, m21, m31);

                // Transform Y components
                d1 = __MM_DOT4x3_PS(m01, m11, m21, m31, s0, s1, s2);    // Y0 Y1 Y2 Y3

                // Shuffle row 2 of the four collapsed matrices for calculating the Z component
                __MM_TRANSPOSE4x4_PS(m02, m12, m22, m32);

                // Transform Z components
                d2 = __MM_DOT4x3_PS(m02, m12, m22, m32, s0, s1, s2);    // Z0 Z1 Z2 Z3

                // Arrange back to 4x3 continuous format for storing the results
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store blended positions
                DestPosAccessor::store(pDestPos + 0, d0);
                DestPosAccessor::store(pDestPos + 4, d1);
                DestPosAccessor::store(pDestPos + 8, d2);

                // Advance 4 vertices
                pSrcPos += 4 * 3;
                pDestPos += 4 * 3;
            }
        }
    };
    static OGRE_FORCE_INLINE void softwareVertexSkinning_SSE_PosOnly_Packed(
        const float* pSrcPos, float* pDestPos,
        const float* pBlendWeight, const unsigned char* pBlendIndex,
        const Affine3* const* blendMatrices,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numIterations)
    {
        assert(_isAlignedForSSE(pSrcPos));

        // Instantiating two versions only, since the other alignment combinations are not that important.
        if (_isAlignedForSSE(pDestPos))
        {
            SoftwareVertexSkinning_SSE_PosOnly_Packed<true, true>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
        else
        {
            SoftwareVertexSkinning_SSE_PosOnly_Packed<true, false>::apply(
                pSrcPos, pDestPos,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numIterations);
        }
    }
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    OptimisedUtilSSE::OptimisedUtilSSE(void)
        : mPreferGeneralVersionForSharedBuffers(false)
    {
        // For the AMD Athlon XP (but not the Athlon 64), it's preferable never to
        // use the unrolled version for shared buffers at all, I guess because that
        // version runs out of usable CPU registers, or because of an L1/L2 cache
        // related problem, causing a slight performance loss compared to the
        // general version.
        //
#if __OGRE_HAVE_NEON == 0
        if (PlatformInformation::getCpuIdentifier().find("AuthenticAMD") != String::npos)
        {
            // How can we check whether it's an Athlon XP but not an Athlon 64?
            // OK, just test whether it supports SSE2/SSE3 or not; if not,
            // assume the general version is faster than the unrolled version :)
            //
            if (!(PlatformInformation::getCpuFeatures() &
                (PlatformInformation::CPU_FEATURE_SSE2 | PlatformInformation::CPU_FEATURE_SSE3)))
            {
                mPreferGeneralVersionForSharedBuffers = true;
            }
        }
#endif
    }
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::softwareVertexSkinning(
        const float *pSrcPos, float *pDestPos,
        const float *pSrcNorm, float *pDestNorm,
        const float *pBlendWeight, const unsigned char* pBlendIndex,
        const Affine3* const* blendMatrices,
        size_t srcPosStride, size_t destPosStride,
        size_t srcNormStride, size_t destNormStride,
        size_t blendWeightStride, size_t blendIndexStride,
        size_t numWeightsPerVertex,
        size_t numVertices)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        // All position/normal pointers should be perfectly aligned, but we still
        // check here to cope with hardware buffers allocated by a potentially
        // buggy driver that doesn't support alignment properly.
        // Because we use the meta-function technique here, the code is easy to
        // maintain and still provides all possible alignment combinations.
        //
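        // Dispatch overview (a summary of the branches below): packed buffers
        // that share interleaved position+normal data go to the Shared_Packed
        // routine, packed but separate position/normal buffers go to
        // Separated_Packed, packed position-only buffers go to PosOnly_Packed,
        // and everything else, plus any leading unaligned vertices and the 0-3
        // leftover vertices after the unrolled loops, is handled by
        // softwareVertexSkinning_SSE_General.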

        // Use unrolled routines only if there are a lot of vertices
        if (numVertices > OGRE_SSE_SKINNING_UNROLL_VERTICES)
        {
            if (pSrcNorm)
            {
                // Blend position and normal

                if (!mPreferGeneralVersionForSharedBuffers &&
                    srcPosStride == sizeof(float) * (3 + 3) && destPosStride == sizeof(float) * (3 + 3) &&
                    pSrcNorm == pSrcPos + 3 && pDestNorm == pDestPos + 3)
                {
                    // Position and normal share a packed buffer

                    size_t srcPosAlign = (size_t)pSrcPos & 15;
                    assert((srcPosAlign & 3) == 0);

                    // Blend unaligned vertices with the general SIMD routine
                    if (srcPosAlign == 8)   // Because of the 8-byte alignment shift per vertex
                    {
                        size_t count = srcPosAlign / 8;
                        numVertices -= count;
                        softwareVertexSkinning_SSE_General(
                            pSrcPos, pDestPos,
                            pSrcNorm, pDestNorm,
                            pBlendWeight, pBlendIndex,
                            blendMatrices,
                            srcPosStride, destPosStride,
                            srcNormStride, destNormStride,
                            blendWeightStride, blendIndexStride,
                            numWeightsPerVertex,
                            count);

                        pSrcPos += count * (3 + 3);
                        pDestPos += count * (3 + 3);
                        pSrcNorm += count * (3 + 3);
                        pDestNorm += count * (3 + 3);
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
                    }

                    // Blend vertices, four vertices per iteration
                    size_t numIterations = numVertices / 4;
                    softwareVertexSkinning_SSE_PosNorm_Shared_Packed(
                        pSrcPos, pDestPos,
                        pBlendWeight, pBlendIndex,
                        blendMatrices,
                        blendWeightStride, blendIndexStride,
                        numWeightsPerVertex,
                        numIterations);

                    // Advance pointers for remaining vertices
                    numVertices &= 3;
                    if (numVertices)
                    {
                        pSrcPos += numIterations * 4 * (3 + 3);
                        pDestPos += numIterations * 4 * (3 + 3);
                        pSrcNorm += numIterations * 4 * (3 + 3);
                        pDestNorm += numIterations * 4 * (3 + 3);
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
                    }
                }
                else if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3 &&
                    srcNormStride == sizeof(float) * 3 && destNormStride == sizeof(float) * 3)
                {
                    // Position and normal are separate buffers, and all of them are packed

                    size_t srcPosAlign = (size_t)pSrcPos & 15;
                    assert((srcPosAlign & 3) == 0);

                    // Blend unaligned vertices with the general SIMD routine
                    if (srcPosAlign)
                    {
                        size_t count = srcPosAlign / 4;
                        numVertices -= count;
                        softwareVertexSkinning_SSE_General(
                            pSrcPos, pDestPos,
                            pSrcNorm, pDestNorm,
                            pBlendWeight, pBlendIndex,
                            blendMatrices,
                            srcPosStride, destPosStride,
                            srcNormStride, destNormStride,
                            blendWeightStride, blendIndexStride,
                            numWeightsPerVertex,
                            count);

                        pSrcPos += count * 3;
                        pDestPos += count * 3;
                        pSrcNorm += count * 3;
                        pDestNorm += count * 3;
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
                    }

                    // Blend vertices, four vertices per iteration
                    size_t numIterations = numVertices / 4;
                    softwareVertexSkinning_SSE_PosNorm_Separated_Packed(
                        pSrcPos, pDestPos,
                        pSrcNorm, pDestNorm,
                        pBlendWeight, pBlendIndex,
                        blendMatrices,
                        blendWeightStride, blendIndexStride,
                        numWeightsPerVertex,
                        numIterations);

                    // Advance pointers for remaining vertices
                    numVertices &= 3;
                    if (numVertices)
                    {
                        pSrcPos += numIterations * 4 * 3;
                        pDestPos += numIterations * 4 * 3;
                        pSrcNorm += numIterations * 4 * 3;
                        pDestNorm += numIterations * 4 * 3;
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
                    }
                }
                else    // Not in 'packed' form, or wrong ordering between position and normal
                {
                    // Should never occur, do nothing here just in case
                }
            }
            else    // !pSrcNorm
            {
                // Blend position only

                if (srcPosStride == sizeof(float) * 3 && destPosStride == sizeof(float) * 3)
                {
                    // All buffers are packed

                    size_t srcPosAlign = (size_t)pSrcPos & 15;
                    assert((srcPosAlign & 3) == 0);

                    // Blend unaligned vertices with the general SIMD routine
                    if (srcPosAlign)
                    {
                        size_t count = srcPosAlign / 4;
                        numVertices -= count;
                        softwareVertexSkinning_SSE_General(
                            pSrcPos, pDestPos,
                            pSrcNorm, pDestNorm,
                            pBlendWeight, pBlendIndex,
                            blendMatrices,
                            srcPosStride, destPosStride,
                            srcNormStride, destNormStride,
                            blendWeightStride, blendIndexStride,
                            numWeightsPerVertex,
                            count);

                        pSrcPos += count * 3;
                        pDestPos += count * 3;
                        advanceRawPointer(pBlendWeight, count * blendWeightStride);
                        advanceRawPointer(pBlendIndex, count * blendIndexStride);
                    }

                    // Blend vertices, four vertices per iteration
                    size_t numIterations = numVertices / 4;
                    softwareVertexSkinning_SSE_PosOnly_Packed(
                        pSrcPos, pDestPos,
                        pBlendWeight, pBlendIndex,
                        blendMatrices,
                        blendWeightStride, blendIndexStride,
                        numWeightsPerVertex,
                        numIterations);

                    // Advance pointers for remaining vertices
                    numVertices &= 3;
                    if (numVertices)
                    {
                        pSrcPos += numIterations * 4 * 3;
                        pDestPos += numIterations * 4 * 3;
                        advanceRawPointer(pBlendWeight, numIterations * 4 * blendWeightStride);
                        advanceRawPointer(pBlendIndex, numIterations * 4 * blendIndexStride);
                    }
                }
                else    // Not in 'packed' form
                {
                    // Might occur only if the user forced software blending of position only
                }
            }
        }

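        // Note on the alignment prologues above: a packed position-only or
        // separate buffer advances by 12 bytes (3 floats) per vertex, so skinning
        // srcPosAlign / 4 vertices with the general routine brings pSrcPos back
        // to a 16-byte boundary; a shared packed buffer advances by 24 bytes per
        // vertex, so its offset within a 16-byte line alternates between 0 and 8
        // and at most one leading vertex needs the general routine.
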
        // Blend remaining vertices; we need to do this with SIMD for identical
        // results, since mixing the general floating-point and SIMD algorithms
        // would cause floating-point discrepancies.
        if (numVertices)
        {
            softwareVertexSkinning_SSE_General(
                pSrcPos, pDestPos,
                pSrcNorm, pDestNorm,
                pBlendWeight, pBlendIndex,
                blendMatrices,
                srcPosStride, destPosStride,
                srcNormStride, destNormStride,
                blendWeightStride, blendIndexStride,
                numWeightsPerVertex,
                numVertices);
        }
    }
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::softwareVertexMorph(
        Real t,
        const float *pSrc1, const float *pSrc2,
        float *pDst,
        size_t pos1VSize, size_t pos2VSize, size_t dstVSize,
        size_t numVertices,
        bool morphNormals)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        __m128 src01, src02, src11, src12, src21, src22;
        __m128 dst0, dst1, dst2;

        __m128 t4 = _mm_load_ps1(&t);


        // If we're morphing normals, we have twice the number of floats to process.
        // Positions are interleaved with normals, so we'll have to separately
        // normalise just the normals later; we'll just lerp in the first pass.
        // We can't normalise as we go, because normals & positions are only 3
        // floats each, so they are not aligned for SSE and we'd mix the data up.
        size_t normalsMultiplier = morphNormals ? 2 : 1;
        size_t numIterations = (numVertices*normalsMultiplier) / 4;
        size_t numVerticesRemainder = (numVertices*normalsMultiplier) & 3;

        // Save for later
        float *pStartDst = pDst;

        // Never use the meta-function technique for accessing memory here,
        // because VC7.1 seems to generate slightly inefficient binary code when
        // the following code is put into an inline function.

        if (_isAlignedForSSE(pSrc1) && _isAlignedForSSE(pSrc2) && _isAlignedForSSE(pDst))
        {
            // All data aligned

            // Morph 4 vertices per iteration. Specially designed to use as many
            // of the available CPU registers as possible (7 registers used here),
            // and to avoid temporary values allocated on the stack, suppressing
            // extra memory accesses.
            for (size_t i = 0; i < numIterations; ++i)
            {
                // 12 floating-point values
                src01 = __MM_LOAD_PS(pSrc1 + 0);
                src02 = __MM_LOAD_PS(pSrc2 + 0);
                src11 = __MM_LOAD_PS(pSrc1 + 4);
                src12 = __MM_LOAD_PS(pSrc2 + 4);
                src21 = __MM_LOAD_PS(pSrc1 + 8);
                src22 = __MM_LOAD_PS(pSrc2 + 8);
                pSrc1 += 12; pSrc2 += 12;

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);
                dst2 = __MM_LERP_PS(t4, src21, src22);

                __MM_STORE_PS(pDst + 0, dst0);
                __MM_STORE_PS(pDst + 4, dst1);
                __MM_STORE_PS(pDst + 8, dst2);
                pDst += 12;
            }

            // Morph remaining vertices
            switch (numVerticesRemainder)
            {
            case 3:
                // 9 floating-point values
                src01 = __MM_LOAD_PS(pSrc1 + 0);
                src02 = __MM_LOAD_PS(pSrc2 + 0);
                src11 = __MM_LOAD_PS(pSrc1 + 4);
                src12 = __MM_LOAD_PS(pSrc2 + 4);
                src21 = _mm_load_ss(pSrc1 + 8);
                src22 = _mm_load_ss(pSrc2 + 8);

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);
                dst2 = __MM_LERP_SS(t4, src21, src22);

                __MM_STORE_PS(pDst + 0, dst0);
                __MM_STORE_PS(pDst + 4, dst1);
                _mm_store_ss(pDst + 8, dst2);
                break;

            case 2:
                // 6 floating-point values
                src01 = __MM_LOAD_PS(pSrc1 + 0);
                src02 = __MM_LOAD_PS(pSrc2 + 0);
                src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4));    // t4 is meaningless here
                src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4));    // t4 is meaningless here

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);

                __MM_STORE_PS(pDst + 0, dst0);
                _mm_storel_pi((__m64*)(pDst + 4), dst1);
                break;

            case 1:
                // 3 floating-point values
                src01 = _mm_load_ss(pSrc1 + 2);
                src02 = _mm_load_ss(pSrc2 + 2);
                src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0));
                src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0));

                dst0 = __MM_LERP_PS(t4, src01, src02);

                _mm_storeh_pi((__m64*)(pDst + 0), dst0);
                _mm_store_ss(pDst + 2, dst0);
                break;
            }
        }
        else    // Should never occur, just in case of buggy drivers
        {
            // Assume all data unaligned

            // Morph 4 vertices per iteration. Specially designed to use as many
            // of the available CPU registers as possible (7 registers used here),
            // and to avoid temporary values allocated on the stack, suppressing
            // extra memory accesses.
            for (size_t i = 0; i < numIterations; ++i)
            {
                // 12 floating-point values
                src01 = _mm_loadu_ps(pSrc1 + 0);
                src02 = _mm_loadu_ps(pSrc2 + 0);
                src11 = _mm_loadu_ps(pSrc1 + 4);
                src12 = _mm_loadu_ps(pSrc2 + 4);
                src21 = _mm_loadu_ps(pSrc1 + 8);
                src22 = _mm_loadu_ps(pSrc2 + 8);
                pSrc1 += 12; pSrc2 += 12;

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);
                dst2 = __MM_LERP_PS(t4, src21, src22);

                _mm_storeu_ps(pDst + 0, dst0);
                _mm_storeu_ps(pDst + 4, dst1);
                _mm_storeu_ps(pDst + 8, dst2);
                pDst += 12;

            }

            // Morph remaining vertices
            switch (numVerticesRemainder)
            {
            case 3:
                // 9 floating-point values
                src01 = _mm_loadu_ps(pSrc1 + 0);
                src02 = _mm_loadu_ps(pSrc2 + 0);
                src11 = _mm_loadu_ps(pSrc1 + 4);
                src12 = _mm_loadu_ps(pSrc2 + 4);
                src21 = _mm_load_ss(pSrc1 + 8);
                src22 = _mm_load_ss(pSrc2 + 8);

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);
                dst2 = __MM_LERP_SS(t4, src21, src22);

                _mm_storeu_ps(pDst + 0, dst0);
                _mm_storeu_ps(pDst + 4, dst1);
                _mm_store_ss(pDst + 8, dst2);
                break;

            case 2:
                // 6 floating-point values
                src01 = _mm_loadu_ps(pSrc1 + 0);
                src02 = _mm_loadu_ps(pSrc2 + 0);
                src11 = _mm_loadl_pi(t4, (const __m64*)(pSrc1 + 4));    // t4 is meaningless here
                src12 = _mm_loadl_pi(t4, (const __m64*)(pSrc2 + 4));    // t4 is meaningless here

                dst0 = __MM_LERP_PS(t4, src01, src02);
                dst1 = __MM_LERP_PS(t4, src11, src12);

                _mm_storeu_ps(pDst + 0, dst0);
                _mm_storel_pi((__m64*)(pDst + 4), dst1);
                break;

            case 1:
                // 3 floating-point values
                src01 = _mm_load_ss(pSrc1 + 2);
                src02 = _mm_load_ss(pSrc2 + 2);
                src01 = _mm_loadh_pi(src01, (const __m64*)(pSrc1 + 0));
                src02 = _mm_loadh_pi(src02, (const __m64*)(pSrc2 + 0));

                dst0 = __MM_LERP_PS(t4, src01, src02);

                _mm_storeh_pi((__m64*)(pDst + 0), dst0);
                _mm_store_ss(pDst + 2, dst0);
                break;
            }

        }

        if (morphNormals)
        {

            // Now we need to do an unaligned normalise on the normals data we
            // just lerped; because normals are 3 elements each, they're always
            // unaligned
            float *pNorm = pStartDst;

            // Offset past the first position
            pNorm += 3;

            // We'll do one normal per iteration, but still use SSE
            for (size_t n = 0; n < numVertices; ++n)
            {
                // normalise function
                __m128 norm;

                // Load 3 floating-point normal values
                // This loads into [0] and clears the rest
                norm = _mm_load_ss(pNorm + 2);
                // This loads into [2,3]; [1] is unused
                norm = _mm_loadh_pi(norm, (__m64*)(pNorm + 0));

                // Fill a 4-vec with the squared vector length
                __m128 tmp = _mm_mul_ps(norm, norm);
                // Add - for this we want this effect:
                // orig   3 | 2 | 1 | 0
                // add1   0 | 0 | 0 | 2
                // add2   2 | 3 | 0 | 3
                // This way elements 0, 2 and 3 have the sum of all entries (except 1, which is unused)

                tmp = _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,0,0,2)));
                // Add the final combination & sqrt
                // The bottom 3 elements of tmp will have the length; we don't care about element 4
                tmp = _mm_add_ps(tmp, _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(2,3,0,3)));
                // Then divide to normalise
                norm = _mm_div_ps(norm, _mm_sqrt_ps(tmp));

                // Store back in the same place
                _mm_storeh_pi((__m64*)(pNorm + 0), norm);
                _mm_store_ss(pNorm + 2, norm);

                // Skip to the next vertex (3x normal components, 3x position components)
                pNorm += 6;


            }


        }
    }
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::concatenateAffineMatrices(
        const Affine3& baseMatrix,
        const Affine3* pSrcMat,
        Affine3* pDstMat,
        size_t numMatrices)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        assert(_isAlignedForSSE(pSrcMat));
        assert(_isAlignedForSSE(pDstMat));

        // Load base matrix, unaligned
        __m128 m0 = _mm_loadu_ps(baseMatrix[0]);
        __m128 m1 = _mm_loadu_ps(baseMatrix[1]);
        __m128 m2 = _mm_loadu_ps(baseMatrix[2]);
        __m128 m3 = _mm_loadu_ps(baseMatrix[3]);    // m3 should be equal to (0, 0, 0, 1)

        for (size_t i = 0; i < numMatrices; ++i)
        {
            // Load source matrix, aligned
            __m128 s0 = __MM_LOAD_PS((*pSrcMat)[0]);
            __m128 s1 = __MM_LOAD_PS((*pSrcMat)[1]);
            __m128 s2 = __MM_LOAD_PS((*pSrcMat)[2]);

            ++pSrcMat;

            __m128 t0, t1, t2, t3;

            // Concatenate matrix, and store the results

            // Row 0
            t0 = _mm_mul_ps(__MM_SELECT(m0, 0), s0);
            t1 = _mm_mul_ps(__MM_SELECT(m0, 1), s1);
            t2 = _mm_mul_ps(__MM_SELECT(m0, 2), s2);
            t3 = _mm_mul_ps(m0, m3);    // Compiler should optimise this out of the loop
            __MM_STORE_PS((*pDstMat)[0], __MM_ACCUM4_PS(t0,t1,t2,t3));

            // Row 1
            t0 = _mm_mul_ps(__MM_SELECT(m1, 0), s0);
            t1 = _mm_mul_ps(__MM_SELECT(m1, 1), s1);
            t2 = _mm_mul_ps(__MM_SELECT(m1, 2), s2);
            t3 = _mm_mul_ps(m1, m3);    // Compiler should optimise this out of the loop
            __MM_STORE_PS((*pDstMat)[1], __MM_ACCUM4_PS(t0,t1,t2,t3));

            // Row 2
            t0 = _mm_mul_ps(__MM_SELECT(m2, 0), s0);
            t1 = _mm_mul_ps(__MM_SELECT(m2, 1), s1);
            t2 = _mm_mul_ps(__MM_SELECT(m2, 2), s2);
            t3 = _mm_mul_ps(m2, m3);    // Compiler should optimise this out of the loop
            __MM_STORE_PS((*pDstMat)[2], __MM_ACCUM4_PS(t0,t1,t2,t3));

            ++pDstMat;
        }
    }
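    //---------------------------------------------------------------------
    // Note on the t3 term above: since an Affine3's implicit bottom row is
    // (0, 0, 0, 1), multiplying a base row by m3 = (0, 0, 0, 1) leaves just
    // (0, 0, 0, base row w), i.e. the base matrix's translation contribution,
    // which is exactly what has to be added on top of the three weighted source
    // rows. That is why only three rows per matrix are loaded, concatenated and
    // stored.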

        // Load Vector3 as: (x, 0, y, z)
#define __LOAD_VECTOR3(p)   _mm_loadh_pi(_mm_load_ss(p), (const __m64*)((p)+1))

        // Mask used to change the sign of single precision floating point values.
        OGRE_SIMD_ALIGNED_DECL(static const uint32, msSignMask[4]) =
        {
            0x80000000, 0x80000000, 0x80000000, 0x80000000,
        };

        size_t numIterations = numTriangles / 4;
        numTriangles &= 3;

        // Four triangles per iteration
        for (size_t i = 0; i < numIterations; ++i)
        {
// Load four Vector3 as: (x0, x1, x2, x3), (y0, y1, y2, y3), (z0, z1, z2, z3)
#define __LOAD_FOUR_VECTOR3(x, y, z, p0, p1, p2, p3)                \
            {                                                       \
                __m128 v0 = __LOAD_VECTOR3(p0);  /* x0 -- y0 z0 */  \
                __m128 v1 = __LOAD_VECTOR3(p1);  /* x1 -- y1 z1 */  \
                __m128 v2 = __LOAD_VECTOR3(p2);  /* x2 -- y2 z2 */  \
                __m128 v3 = __LOAD_VECTOR3(p3);  /* x3 -- y3 z3 */  \
                __m128 t0, t1;                                      \
                                                                    \
                t0 = _mm_unpacklo_ps(v0, v2);    /* x0 x2 -- -- */  \
                t1 = _mm_unpacklo_ps(v1, v3);    /* x1 x3 -- -- */  \
                x  = _mm_unpacklo_ps(t0, t1);    /* x0 x1 x2 x3 */  \
                                                                    \
                t0 = _mm_unpackhi_ps(v0, v2);    /* y0 y2 z0 z2 */  \
                t1 = _mm_unpackhi_ps(v1, v3);    /* y1 y3 z1 z3 */  \
                y  = _mm_unpacklo_ps(t0, t1);    /* y0 y1 y2 y3 */  \
                z  = _mm_unpackhi_ps(t0, t1);    /* z0 z1 z2 z3 */  \
            }

            __m128 x0, x1, x2, y0, y1, y2, z0, z1, z2;

            // Load vertex 0 of four triangles, packed as component-major format: xxxx yyyy zzzz
            __LOAD_FOUR_VECTOR3(x0, y0, z0,
                positions + triangles[0].vertIndex[0] * 3,
                positions + triangles[1].vertIndex[0] * 3,
                positions + triangles[2].vertIndex[0] * 3,
                positions + triangles[3].vertIndex[0] * 3);

            // Load vertex 1 of four triangles, packed as component-major format: xxxx yyyy zzzz
            __LOAD_FOUR_VECTOR3(x1, y1, z1,
                positions + triangles[0].vertIndex[1] * 3,
                positions + triangles[1].vertIndex[1] * 3,
                positions + triangles[2].vertIndex[1] * 3,
                positions + triangles[3].vertIndex[1] * 3);

            // Load vertex 2 of four triangles, packed as component-major format: xxxx yyyy zzzz
            __LOAD_FOUR_VECTOR3(x2, y2, z2,
                positions + triangles[0].vertIndex[2] * 3,
                positions + triangles[1].vertIndex[2] * 3,
                positions + triangles[2].vertIndex[2] * 3,
                positions + triangles[3].vertIndex[2] * 3);

            triangles += 4;

            // Calculate triangle face normals

            // a = v1 - v0
            __m128 ax = _mm_sub_ps(x1, x0);
            __m128 ay = _mm_sub_ps(y1, y0);
            __m128 az = _mm_sub_ps(z1, z0);

            // b = v2 - v0
            __m128 bx = _mm_sub_ps(x2, x0);
            __m128 by = _mm_sub_ps(y2, y0);
            __m128 bz = _mm_sub_ps(z2, z0);

            // n = a cross b
            __m128 nx = _mm_sub_ps(_mm_mul_ps(ay, bz), _mm_mul_ps(az, by));
            __m128 ny = _mm_sub_ps(_mm_mul_ps(az, bx), _mm_mul_ps(ax, bz));
            __m128 nz = _mm_sub_ps(_mm_mul_ps(ax, by), _mm_mul_ps(ay, bx));

            // w = - (n dot v0)
            __m128 nw = _mm_xor_ps(
                __MM_DOT3x3_PS(nx, ny, nz, x0, y0, z0),
                *(const __m128 *)&msSignMask);

            // Arrange to per-triangle face normal major format
            __MM_TRANSPOSE4x4_PS(nx, ny, nz, nw);

            // Store results
            __MM_STORE_PS(&faceNormals[0].x, nx);
            __MM_STORE_PS(&faceNormals[1].x, ny);
            __MM_STORE_PS(&faceNormals[2].x, nz);
            __MM_STORE_PS(&faceNormals[3].x, nw);
            faceNormals += 4;

#undef __LOAD_FOUR_VECTOR3
        }
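
        // The remainder path below handles one triangle at a time. Instead of
        // transposing into SoA form, it keeps each vertex in the (x, 0, y, z)
        // register layout produced by __LOAD_VECTOR3 and shuffles that layout in
        // place to form the cross product; the lane comments on each line track
        // which component ends up where. The final result it writes is the same
        // plane equation (nx, ny, nz, -(n dot v0)) as the unrolled loop above.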

        // Dealing with remaining triangles
        for (size_t j = 0; j < numTriangles; ++j)
        {
            // Load vertices of the triangle
            __m128 v0 = __LOAD_VECTOR3(positions + triangles->vertIndex[0] * 3);
            __m128 v1 = __LOAD_VECTOR3(positions + triangles->vertIndex[1] * 3);
            __m128 v2 = __LOAD_VECTOR3(positions + triangles->vertIndex[2] * 3);
            ++triangles;

            // Calculate face normal

            __m128 t0, t1;

            __m128 a = _mm_sub_ps(v1, v0);                      // ax 0 ay az
            __m128 b = _mm_sub_ps(v2, v0);                      // bx 0 by bz
            t0 = _mm_shuffle_ps(a, a, _MM_SHUFFLE(2,0,1,3));    // az 0 ax ay
            t1 = _mm_shuffle_ps(b, b, _MM_SHUFFLE(2,0,1,3));    // bz 0 bx by
            t0 = _mm_mul_ps(t0, b);                             // az*bx 0 ax*by ay*bz
            t1 = _mm_mul_ps(t1, a);                             // ax*bz 0 ay*bx az*by

            __m128 n = _mm_sub_ps(t0, t1);                      // ny 0 nz nx

            __m128 d = _mm_mul_ps(                              // dy 0 dz dx
                _mm_shuffle_ps(v0, v0, _MM_SHUFFLE(0,3,1,2)), n);

            n = _mm_sub_ps(_mm_sub_ps(_mm_sub_ps(               // nx ny nz -(dx+dy+dz)
                _mm_shuffle_ps(n, n, _MM_SHUFFLE(1,2,0,3)),     // nx ny nz 0
                _mm_shuffle_ps(d, d, _MM_SHUFFLE(3,1,1,1))),    // 0  0  0  dx
                _mm_shuffle_ps(d, d, _MM_SHUFFLE(0,1,1,1))),    // 0  0  0  dy
                _mm_shuffle_ps(d, d, _MM_SHUFFLE(2,1,1,1)));    // 0  0  0  dz

            // Store result
            __MM_STORE_PS(&faceNormals->x, n);
            ++faceNormals;
        }

#undef __LOAD_VECTOR3
    }
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::calculateLightFacing(
        const Vector4& lightPos,
        const Vector4* faceNormals,
        char* lightFacings,
        size_t numFaces)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        assert(_isAlignedForSSE(faceNormals));

        // Map to convert a 4-bit mask to 4 byte values
        static const char msMaskMapping[16][4] =
        {
            {0, 0, 0, 0},   {1, 0, 0, 0},   {0, 1, 0, 0},   {1, 1, 0, 0},
            {0, 0, 1, 0},   {1, 0, 1, 0},   {0, 1, 1, 0},   {1, 1, 1, 0},
            {0, 0, 0, 1},   {1, 0, 0, 1},   {0, 1, 0, 1},   {1, 1, 0, 1},
            {0, 0, 1, 1},   {1, 0, 1, 1},   {0, 1, 1, 1},   {1, 1, 1, 1},
        };

        __m128 n0, n1, n2, n3;
        __m128 t0, t1;
        __m128 dp;
        int bitmask;

        // Load light vector, unaligned
        __m128 lp = _mm_loadu_ps(&lightPos.x);

        // Preload zero into a register for comparing the dot product values against
        __m128 zero = _mm_setzero_ps();

        size_t numIterations = numFaces / 4;
        numFaces &= 3;

        // Four faces per iteration
        for (size_t i = 0; i < numIterations; ++i)
        {
            // Load face normals, aligned
            n0 = __MM_LOAD_PS(&faceNormals[0].x);
            n1 = __MM_LOAD_PS(&faceNormals[1].x);
            n2 = __MM_LOAD_PS(&faceNormals[2].x);
            n3 = __MM_LOAD_PS(&faceNormals[3].x);
            faceNormals += 4;

            // Multiply by light vector
            n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
            n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
            n2 = _mm_mul_ps(n2, lp);        // x2 y2 z2 w2
            n3 = _mm_mul_ps(n3, lp);        // x3 y3 z3 w3

            // Horizontal add four vector values.
            t0 = _mm_add_ps(                            // x0+z0 x1+z1 y0+w0 y1+w1
                _mm_unpacklo_ps(n0, n1),                // x0 x1 y0 y1
                _mm_unpackhi_ps(n0, n1));               // z0 z1 w0 w1
            t1 = _mm_add_ps(                            // x2+z2 x3+z3 y2+w2 y3+w3
                _mm_unpacklo_ps(n2, n3),                // x2 x3 y2 y3
                _mm_unpackhi_ps(n2, n3));               // z2 z3 w2 w3
            dp = _mm_add_ps(                            // dp0 dp1 dp2 dp3
                _mm_movelh_ps(t0, t1),                  // x0+z0 x1+z1 x2+z2 x3+z3
                _mm_movehl_ps(t1, t0));                 // y0+w0 y1+w1 y2+w2 y3+w3
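
            // The unpack/move/add sequence above is a 4-wide horizontal add: after
            // it, each lane of 'dp' holds one face's 4-component dot product of its
            // plane equation with the light position. Roughly, per lane (sketch
            // only, not the code path used):
            //
            //     dp[i] = n[i].x*lp.x + n[i].y*lp.y + n[i].z*lp.z + n[i].w*lp.w;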

            // Compare greater than zero and set up a 4-bit mask. Use '_mm_cmpnle_ps'
            // instead of '_mm_cmpgt_ps' here because we want to keep 'zero' untouched,
            // i.e. as the 2nd operand of the assembly instruction. In fact,
            // '_mm_cmpgt_ps' was implemented as 'CMPLTPS' with the operands swapped
            // in VC7.1.
            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));

            // Convert the 4-bit mask to 4 bytes, and store results.
            /*
            *reinterpret_cast<uint32*>(lightFacings) =
                *reinterpret_cast<const uint32*>(msMaskMapping[bitmask]);
            */
            memcpy(lightFacings, msMaskMapping[bitmask], sizeof(uint32));

            lightFacings += 4;
        }

        // Dealing with remaining faces
        switch (numFaces)
        {
        case 3:
            n0 = __MM_LOAD_PS(&faceNormals[0].x);
            n1 = __MM_LOAD_PS(&faceNormals[1].x);
            n2 = __MM_LOAD_PS(&faceNormals[2].x);

            n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
            n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1
            n2 = _mm_mul_ps(n2, lp);        // x2 y2 z2 w2

            t0 = _mm_add_ps(                            // x0+z0 x1+z1 y0+w0 y1+w1
                _mm_unpacklo_ps(n0, n1),                // x0 x1 y0 y1
                _mm_unpackhi_ps(n0, n1));               // z0 z1 w0 w1
            t1 = _mm_add_ps(                            // x2+z2 x2+z2 y2+w2 y2+w2
                _mm_unpacklo_ps(n2, n2),                // x2 x2 y2 y2
                _mm_unpackhi_ps(n2, n2));               // z2 z2 w2 w2
            dp = _mm_add_ps(                            // dp0 dp1 dp2 dp2
                _mm_movelh_ps(t0, t1),                  // x0+z0 x1+z1 x2+z2 x2+z2
                _mm_movehl_ps(t1, t0));                 // y0+w0 y1+w1 y2+w2 y2+w2

            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));

            lightFacings[0] = msMaskMapping[bitmask][0];
            lightFacings[1] = msMaskMapping[bitmask][1];
            lightFacings[2] = msMaskMapping[bitmask][2];
            break;

        case 2:
            n0 = __MM_LOAD_PS(&faceNormals[0].x);
            n1 = __MM_LOAD_PS(&faceNormals[1].x);

            n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0
            n1 = _mm_mul_ps(n1, lp);        // x1 y1 z1 w1

            t0 = _mm_add_ps(                            // x0+z0 x1+z1 y0+w0 y1+w1
                _mm_unpacklo_ps(n0, n1),                // x0 x1 y0 y1
                _mm_unpackhi_ps(n0, n1));               // z0 z1 w0 w1
            dp = _mm_add_ps(                            // dp0 dp1 dp0 dp1
                _mm_movelh_ps(t0, t0),                  // x0+z0 x1+z1 x0+z0 x1+z1
                _mm_movehl_ps(t0, t0));                 // y0+w0 y1+w1 y0+w0 y1+w1

            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));

            lightFacings[0] = msMaskMapping[bitmask][0];
            lightFacings[1] = msMaskMapping[bitmask][1];
            break;

        case 1:
            n0 = __MM_LOAD_PS(&faceNormals[0].x);

            n0 = _mm_mul_ps(n0, lp);        // x0 y0 z0 w0

            t0 = _mm_add_ps(                            // x0+z0 x0+z0 y0+w0 y0+w0
                _mm_unpacklo_ps(n0, n0),                // x0 x0 y0 y0
                _mm_unpackhi_ps(n0, n0));               // z0 z0 w0 w0
            dp = _mm_add_ps(                            // dp0 dp0 dp0 dp0
                _mm_movelh_ps(t0, t0),                  // x0+z0 x0+z0 x0+z0 x0+z0
                _mm_movehl_ps(t0, t0));                 // y0+w0 y0+w0 y0+w0 y0+w0

            bitmask = _mm_movemask_ps(_mm_cmpnle_ps(dp, zero));

            lightFacings[0] = msMaskMapping[bitmask][0];
            break;
        }
    }
    //---------------------------------------------------------------------
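    // Shadow volume extrusion, for reference: a directional light shifts every
    // vertex by a constant offset along the (negated, normalised) light vector,
    // while a point light pushes each vertex away from the light position. A rough
    // scalar sketch of both cases (illustrative only, not the code path used):
    //
    //     // directional light (lightPos.w == 0)
    //     Vector3 l(lightPos.x, lightPos.y, lightPos.z);
    //     dest = src - l.normalisedCopy() * extrudeDist;
    //
    //     // point light (lightPos.w == 1)
    //     Vector3 l(lightPos.x, lightPos.y, lightPos.z);
    //     dest = src + (src - l).normalisedCopy() * extrudeDist;
    //
    // The templates below implement these two cases, specialised on whether the
    // source and destination buffers are SIMD-aligned.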
    // Template to extrude vertices for a directional light.
    template <bool srcAligned, bool destAligned>
    struct ExtrudeVertices_SSE_DirectionalLight
    {
        static void apply(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* pSrcPos,
            float* pDestPos,
            size_t numVertices)
        {
            typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
            typedef SSEMemoryAccessor<destAligned> DestAccessor;

            // Directional light, extrusion is along the light direction

            // Load light vector, unaligned
            __m128 lp = _mm_loadu_ps(&lightPos.x);

            // Calculate the extrusion direction. Note that we use the inverted
            // direction here to eliminate an extra negate instruction; we'll
            // compensate for that by using a subtract instruction later.
            __m128 tmp = _mm_mul_ps(lp, lp);
            tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_shuffle_ps(tmp, tmp, 1)), _mm_movehl_ps(tmp, tmp));
            // Looks like VC7.1 generates a bit inefficient code for 'rsqrtss', so use 'rsqrtps' instead
            tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), _mm_load_ss(&extrudeDist));
            __m128 dir = _mm_mul_ps(lp, __MM_SELECT(tmp, 0));               // X Y Z -

            // Prepare the extrude direction for extruding 4 vertices in parallel
            __m128 dir0 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(0,2,1,0));   // X Y Z X
            __m128 dir1 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(1,0,2,1));   // Y Z X Y
            __m128 dir2 = _mm_shuffle_ps(dir, dir, _MM_SHUFFLE(2,1,0,2));   // Z X Y Z

            __m128 s0, s1, s2;
            __m128 d0, d1, d2;

            size_t numIterations = numVertices / 4;
            numVertices &= 3;

            // Extruding 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                s0 = SrcAccessor::load(pSrcPos + 0);
                s1 = SrcAccessor::load(pSrcPos + 4);
                s2 = SrcAccessor::load(pSrcPos + 8);
                pSrcPos += 12;

                // The extrusion direction is inverted, use subtract instruction here
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
                d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 X2 Y2
                d2 = _mm_sub_ps(s2, dir2);                      // Z2 X3 Y3 Z3

                DestAccessor::store(pDestPos + 0, d0);
                DestAccessor::store(pDestPos + 4, d1);
                DestAccessor::store(pDestPos + 8, d2);
                pDestPos += 12;
            }
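
            // Note on the loop above: the source buffer is a packed stream of
            // x y z triples, so a block of 4 vertices occupies exactly 3 registers
            // laid out as (x0 y0 z0 x1), (y1 z1 x2 y2), (z2 x3 y3 z3). dir0/dir1/dir2
            // are the extrusion vector rotated into those same component orders,
            // which is why one subtract per register extrudes the whole block
            // without any transpose.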

            // Dealing with remaining vertices
            switch (numVertices)
            {
            case 3:
                // 9 floating-point values
                s0 = SrcAccessor::load(pSrcPos + 0);
                s1 = SrcAccessor::load(pSrcPos + 4);
                s2 = _mm_load_ss(pSrcPos + 8);

                // The extrusion direction is inverted, use subtract instruction here
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
                d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 X2 Y2
                d2 = _mm_sub_ss(s2, dir2);                      // Z2 -- -- --

                DestAccessor::store(pDestPos + 0, d0);
                DestAccessor::store(pDestPos + 4, d1);
                _mm_store_ss(pDestPos + 8, d2);
                break;

            case 2:
                // 6 floating-point values
                s0 = SrcAccessor::load(pSrcPos + 0);
                s1 = _mm_loadl_pi(dir1, (const __m64*)(pSrcPos + 4));   // dir1 is meaningless here

                // The extrusion direction is inverted, use subtract instruction here
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 Z0 X1
                d1 = _mm_sub_ps(s1, dir1);                      // Y1 Z1 -- --

                DestAccessor::store(pDestPos + 0, d0);
                _mm_storel_pi((__m64*)(pDestPos + 4), d1);
                break;

            case 1:
                // 3 floating-point values
                s0 = _mm_loadl_pi(dir0, (const __m64*)(pSrcPos + 0));   // dir0 is meaningless here
                s1 = _mm_load_ss(pSrcPos + 2);

                // The extrusion direction is inverted, use subtract instruction here
                d0 = _mm_sub_ps(s0, dir0);                      // X0 Y0 -- --
                d1 = _mm_sub_ss(s1, dir2);                      // Z0 -- -- --

                _mm_storel_pi((__m64*)(pDestPos + 0), d0);
                _mm_store_ss(pDestPos + 2, d1);
                break;
            }
        }
    };
    //---------------------------------------------------------------------
    // Template to extrude vertices for a point light.
    template <bool srcAligned, bool destAligned>
    struct ExtrudeVertices_SSE_PointLight
    {
        static void apply(
            const Vector4& lightPos,
            Real extrudeDist,
            const float* pSrcPos,
            float* pDestPos,
            size_t numVertices)
        {
            typedef SSEMemoryAccessor<srcAligned> SrcAccessor;
            typedef SSEMemoryAccessor<destAligned> DestAccessor;

            // Point light, will calculate the extrusion direction for every vertex

            // Load light vector, unaligned
            __m128 lp = _mm_loadu_ps(&lightPos.x);

            // Load extrude distance
            __m128 extrudeDist4 = _mm_load_ps1(&extrudeDist);

            size_t numIterations = numVertices / 4;
            numVertices &= 3;

            // Extruding 4 vertices per iteration
            for (size_t i = 0; i < numIterations; ++i)
            {
                // Load source positions
                __m128 s0 = SrcAccessor::load(pSrcPos + 0);     // x0 y0 z0 x1
                __m128 s1 = SrcAccessor::load(pSrcPos + 4);     // y1 z1 x2 y2
                __m128 s2 = SrcAccessor::load(pSrcPos + 8);     // z2 x3 y3 z3
                pSrcPos += 12;

                // Arrange to 3x4 component-major format for batch calculation
                __MM_TRANSPOSE4x3_PS(s0, s1, s2);

                // Calculate unnormalised extrusion direction
                __m128 dx = _mm_sub_ps(s0, __MM_SELECT(lp, 0)); // X0 X1 X2 X3
                __m128 dy = _mm_sub_ps(s1, __MM_SELECT(lp, 1)); // Y0 Y1 Y2 Y3
                __m128 dz = _mm_sub_ps(s2, __MM_SELECT(lp, 2)); // Z0 Z1 Z2 Z3

                // Normalise extrusion direction and multiply by extrude distance
                __m128 tmp = __MM_DOT3x3_PS(dx, dy, dz, dx, dy, dz);
                tmp = _mm_mul_ps(_mm_rsqrt_ps(tmp), extrudeDist4);
                dx = _mm_mul_ps(dx, tmp);
                dy = _mm_mul_ps(dy, tmp);
                dz = _mm_mul_ps(dz, tmp);

                // Calculate extruded positions
                __m128 d0 = _mm_add_ps(dx, s0);
                __m128 d1 = _mm_add_ps(dy, s1);
                __m128 d2 = _mm_add_ps(dz, s2);

                // Arrange back to 4x3 continuous format to store the results
                __MM_TRANSPOSE3x4_PS(d0, d1, d2);

                // Store extruded positions
                DestAccessor::store(pDestPos + 0, d0);
                DestAccessor::store(pDestPos + 4, d1);
                DestAccessor::store(pDestPos + 8, d2);
                pDestPos += 12;
            }
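
            // Note: _mm_rsqrt_ps is only an approximation (roughly 12 bits of
            // mantissa precision), which is acceptable here because shadow volume
            // extrusion does not need an exact unit-length direction; a
            // Newton-Raphson refinement step could be added if more precision were
            // ever required.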

            // Dealing with remaining vertices
            for (size_t j = 0; j < numVertices; ++j)
            {
                // Load source position
                __m128 src = _mm_loadh_pi(_mm_load_ss(pSrcPos + 0), (const __m64*)(pSrcPos + 1));   // x 0 y z
                pSrcPos += 3;

                // Calculate unnormalised extrusion direction
                __m128 dir = _mm_sub_ps(src, _mm_shuffle_ps(lp, lp, _MM_SHUFFLE(2,1,3,0)));         // X 1 Y Z

                // Normalise extrusion direction and multiply by extrude distance
                __m128 tmp = _mm_mul_ps(dir, dir);
                tmp = _mm_add_ss(_mm_add_ss(tmp, _mm_movehl_ps(tmp, tmp)), _mm_shuffle_ps(tmp, tmp, 3));
                // Looks like VC7.1 generates a bit inefficient code for 'rsqrtss', so use 'rsqrtps' instead
                tmp = _mm_mul_ss(_mm_rsqrt_ps(tmp), extrudeDist4);
                dir = _mm_mul_ps(dir, __MM_SELECT(tmp, 0));

                // Calculate extruded position
                __m128 dst = _mm_add_ps(dir, src);

                // Store extruded position
                _mm_store_ss(pDestPos + 0, dst);
                _mm_storeh_pi((__m64*)(pDestPos + 1), dst);
                pDestPos += 3;
            }
        }
    };
    //---------------------------------------------------------------------
    void OptimisedUtilSSE::extrudeVertices(
        const Vector4& lightPos,
        Real extrudeDist,
        const float* pSrcPos,
        float* pDestPos,
        size_t numVertices)
    {
        __OGRE_CHECK_STACK_ALIGNED_FOR_SSE();

        // Note: Since pDestPos follows the tail of pSrcPos, we can't assume it's
        // properly aligned to the SIMD alignment, so we must check for it here.
        //
        // TODO: Add an extra vertex to the vertex buffer to make sure pDestPos is
        // aligned the same as pSrcPos.
        //

        // We use the SSE reciprocal square root directly while calculating the
        // extrusion direction, since the precision loss is not that important here.
        //
        if (lightPos.w == 0.0f)
        {
            if (_isAlignedForSSE(pSrcPos))
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_DirectionalLight<true, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_DirectionalLight<true, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
            else
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_DirectionalLight<false, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_DirectionalLight<false, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
        }
        else
        {
            assert(lightPos.w == 1.0f);

            if (_isAlignedForSSE(pSrcPos))
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_PointLight<true, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_PointLight<true, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
            else
            {
                if (_isAlignedForSSE(pDestPos))
                    ExtrudeVertices_SSE_PointLight<false, true>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
                else
                    ExtrudeVertices_SSE_PointLight<false, false>::apply(
                        lightPos, extrudeDist, pSrcPos, pDestPos, numVertices);
            }
        }
    }
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    //---------------------------------------------------------------------
    extern OptimisedUtil* _getOptimisedUtilSSE(void);
    extern OptimisedUtil* _getOptimisedUtilSSE(void)
    {
        static OptimisedUtilSSE msOptimisedUtilSSE;
#if defined(__OGRE_SIMD_ALIGN_STACK)
        static OptimisedUtilWithStackAlign msOptimisedUtilWithStackAlign(&msOptimisedUtilSSE);
        return &msOptimisedUtilWithStackAlign;
#else
        return &msOptimisedUtilSSE;
#endif
    }

}

#endif // __OGRE_HAVE_SSE || __OGRE_HAVE_NEON