//#####################################################################
// Copyright (c) 2010-2011, Eftychios Sifakis.
//
// Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
//   * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
//   * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
//     other materials provided with the distribution.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
// BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
// SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
// DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
// NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 //##################################################################### 16 17 #ifdef __INTEL_COMPILER 18 #pragma warning( disable : 592 ) 19 #endif 20 21 // #define USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION 22 // #define PERFORM_STRICT_QUATERNION_RENORMALIZATION 23 24 { // Begin block : Scope of qV (if not maintained) 25 26 #ifndef COMPUTE_V_AS_QUATERNION ENABLE_SSE_IMPLEMENTATION(__m128 Vqvs;)27 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvs;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvs;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvs;) 28 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvx;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvx;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvx;) 29 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvy;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvy;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvy;) 30 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvz;) ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvz;) ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvz;) 31 #endif 32 33 { // Begin block : Symmetric eigenanalysis 34 35 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss11;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs11;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs11;) 36 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss21;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs21;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs21;) 37 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss31;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs31;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs31;) 38 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss22;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs22;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs22;) 39 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss32;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs32;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs32;) 40 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss33;) ENABLE_SSE_IMPLEMENTATION(__m128 Vs33;) ENABLE_AVX_IMPLEMENTATION(__m256 Vs33;) 41 42 
ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vone;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vone;) 43 ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_xor_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_xor_ps(Vqvvx,Vqvvx);) 44 ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_xor_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_xor_ps(Vqvvy,Vqvvy);) 45 ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_xor_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_xor_ps(Vqvvz,Vqvvz);) 46 47 //########################################################### 48 // Compute normal equations matrix 49 //########################################################### 50 51 ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Sa11.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_mul_ps(Va11,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_mul_ps(Va11,Va11);) 52 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa21.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va21,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va21,Va21);) 53 ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Stmp1.f+Ss11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_add_ps(Vtmp1,Vs11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_add_ps(Vtmp1,Vs11);) 54 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa31.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va31,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va31,Va31);) 55 ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Stmp1.f+Ss11.f;) ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_add_ps(Vtmp1,Vs11);) ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_add_ps(Vtmp1,Vs11);) 56 57 ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Sa12.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_mul_ps(Va12,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_mul_ps(Va12,Va11);) 58 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa22.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va22,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va22,Va21);) 59 
ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Stmp1.f+Ss21.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_add_ps(Vtmp1,Vs21);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_add_ps(Vtmp1,Vs21);) 60 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa32.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va32,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va32,Va31);) 61 ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Stmp1.f+Ss21.f;) ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_add_ps(Vtmp1,Vs21);) ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_add_ps(Vtmp1,Vs21);) 62 63 ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Sa13.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_mul_ps(Va13,Va11);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_mul_ps(Va13,Va11);) 64 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va21);) 65 ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Stmp1.f+Ss31.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_add_ps(Vtmp1,Vs31);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_add_ps(Vtmp1,Vs31);) 66 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va31);) 67 ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Stmp1.f+Ss31.f;) ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_add_ps(Vtmp1,Vs31);) ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_add_ps(Vtmp1,Vs31);) 68 69 ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Sa12.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_mul_ps(Va12,Va12);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_mul_ps(Va12,Va12);) 70 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa22.f*Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va22,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va22,Va22);) 71 ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Stmp1.f+Ss22.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_add_ps(Vtmp1,Vs22);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_add_ps(Vtmp1,Vs22);) 72 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa32.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va32,Va32);) 
ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va32,Va32);) 73 ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Stmp1.f+Ss22.f;) ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_add_ps(Vtmp1,Vs22);) ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_add_ps(Vtmp1,Vs22);) 74 75 ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Sa13.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_mul_ps(Va13,Va12);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_mul_ps(Va13,Va12);) 76 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va22);) 77 ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Stmp1.f+Ss32.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_add_ps(Vtmp1,Vs32);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_add_ps(Vtmp1,Vs32);) 78 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va32);) 79 ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Stmp1.f+Ss32.f;) ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_add_ps(Vtmp1,Vs32);) ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_add_ps(Vtmp1,Vs32);) 80 81 ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Sa13.f*Sa13.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_mul_ps(Va13,Va13);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_mul_ps(Va13,Va13);) 82 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va23);) 83 ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Stmp1.f+Ss33.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_add_ps(Vtmp1,Vs33);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_add_ps(Vtmp1,Vs33);) 84 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa33.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va33);) 85 ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Stmp1.f+Ss33.f;) ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_add_ps(Vtmp1,Vs33);) ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_add_ps(Vtmp1,Vs33);) 86 87 
//########################################################### 88 // Solve symmetric eigenproblem using Jacobi iteration 89 //########################################################### 90 91 for(int sweep=1;sweep<=4;sweep++){ 92 93 // First Jacobi conjugation 94 95 #define SS11 Ss11 96 #define SS21 Ss21 97 #define SS31 Ss31 98 #define SS22 Ss22 99 #define SS32 Ss32 100 #define SS33 Ss33 101 #define SQVVX Sqvvx 102 #define SQVVY Sqvvy 103 #define SQVVZ Sqvvz 104 #define STMP1 Stmp1 105 #define STMP2 Stmp2 106 #define STMP3 Stmp3 107 108 #define VS11 Vs11 109 #define VS21 Vs21 110 #define VS31 Vs31 111 #define VS22 Vs22 112 #define VS32 Vs32 113 #define VS33 Vs33 114 #define VQVVX Vqvvx 115 #define VQVVY Vqvvy 116 #define VQVVZ Vqvvz 117 #define VTMP1 Vtmp1 118 #define VTMP2 Vtmp2 119 #define VTMP3 Vtmp3 120 121 #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp" 122 123 #undef SS11 124 #undef SS21 125 #undef SS31 126 #undef SS22 127 #undef SS32 128 #undef SS33 129 #undef SQVVX 130 #undef SQVVY 131 #undef SQVVZ 132 #undef STMP1 133 #undef STMP2 134 #undef STMP3 135 136 #undef VS11 137 #undef VS21 138 #undef VS31 139 #undef VS22 140 #undef VS32 141 #undef VS33 142 #undef VQVVX 143 #undef VQVVY 144 #undef VQVVZ 145 #undef VTMP1 146 #undef VTMP2 147 #undef VTMP3 148 149 // Second Jacobi conjugation 150 151 #define SS11 Ss22 152 #define SS21 Ss32 153 #define SS31 Ss21 154 #define SS22 Ss33 155 #define SS32 Ss31 156 #define SS33 Ss11 157 #define SQVVX Sqvvy 158 #define SQVVY Sqvvz 159 #define SQVVZ Sqvvx 160 #define STMP1 Stmp2 161 #define STMP2 Stmp3 162 #define STMP3 Stmp1 163 164 #define VS11 Vs22 165 #define VS21 Vs32 166 #define VS31 Vs21 167 #define VS22 Vs33 168 #define VS32 Vs31 169 #define VS33 Vs11 170 #define VQVVX Vqvvy 171 #define VQVVY Vqvvz 172 #define VQVVZ Vqvvx 173 #define VTMP1 Vtmp2 174 #define VTMP2 Vtmp3 175 #define VTMP3 Vtmp1 176 177 #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp" 178 179 #undef SS11 180 
#undef SS21 181 #undef SS31 182 #undef SS22 183 #undef SS32 184 #undef SS33 185 #undef SQVVX 186 #undef SQVVY 187 #undef SQVVZ 188 #undef STMP1 189 #undef STMP2 190 #undef STMP3 191 192 #undef VS11 193 #undef VS21 194 #undef VS31 195 #undef VS22 196 #undef VS32 197 #undef VS33 198 #undef VQVVX 199 #undef VQVVY 200 #undef VQVVZ 201 #undef VTMP1 202 #undef VTMP2 203 #undef VTMP3 204 205 // Third Jacobi conjugation 206 207 #define SS11 Ss33 208 #define SS21 Ss31 209 #define SS31 Ss32 210 #define SS22 Ss11 211 #define SS32 Ss21 212 #define SS33 Ss22 213 #define SQVVX Sqvvz 214 #define SQVVY Sqvvx 215 #define SQVVZ Sqvvy 216 #define STMP1 Stmp3 217 #define STMP2 Stmp1 218 #define STMP3 Stmp2 219 220 #define VS11 Vs33 221 #define VS21 Vs31 222 #define VS31 Vs32 223 #define VS22 Vs11 224 #define VS32 Vs21 225 #define VS33 Vs22 226 #define VQVVX Vqvvz 227 #define VQVVY Vqvvx 228 #define VQVVZ Vqvvy 229 #define VTMP1 Vtmp3 230 #define VTMP2 Vtmp1 231 #define VTMP3 Vtmp2 232 233 #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp" 234 235 #undef SS11 236 #undef SS21 237 #undef SS31 238 #undef SS22 239 #undef SS32 240 #undef SS33 241 #undef SQVVX 242 #undef SQVVY 243 #undef SQVVZ 244 #undef STMP1 245 #undef STMP2 246 #undef STMP3 247 248 #undef VS11 249 #undef VS21 250 #undef VS31 251 #undef VS22 252 #undef VS32 253 #undef VS33 254 #undef VQVVX 255 #undef VQVVY 256 #undef VQVVZ 257 #undef VTMP1 258 #undef VTMP2 259 #undef VTMP3 260 } 261 262 #ifdef PRINT_DEBUGGING_OUTPUT 263 #ifdef USE_SCALAR_IMPLEMENTATION 264 std::cout<<"Scalar S ="<<std::endl; 265 std::cout<<std::setw(12)<<Ss11.f<<std::endl; 266 std::cout<<std::setw(12)<<Ss21.f<<" "<<std::setw(12)<<Ss22.f<<std::endl; 267 std::cout<<std::setw(12)<<Ss31.f<<" "<<std::setw(12)<<Ss32.f<<" "<<std::setw(12)<<Ss33.f<<std::endl; 268 #endif 269 #ifdef USE_SSE_IMPLEMENTATION 270 _mm_storeu_ps(buf,Vs11);S11=buf[0]; 271 _mm_storeu_ps(buf,Vs21);S21=buf[0]; 272 _mm_storeu_ps(buf,Vs31);S31=buf[0]; 273 
_mm_storeu_ps(buf,Vs22);S22=buf[0]; 274 _mm_storeu_ps(buf,Vs32);S32=buf[0]; 275 _mm_storeu_ps(buf,Vs33);S33=buf[0]; 276 std::cout<<"Vector S ="<<std::endl; 277 std::cout<<std::setw(12)<<S11<<std::endl; 278 std::cout<<std::setw(12)<<S21<<" "<<std::setw(12)<<S22<<std::endl; 279 std::cout<<std::setw(12)<<S31<<" "<<std::setw(12)<<S32<<" "<<std::setw(12)<<S33<<std::endl; 280 #endif 281 #ifdef USE_AVX_IMPLEMENTATION 282 _mm256_storeu_ps(buf,Vs11);S11=buf[0]; 283 _mm256_storeu_ps(buf,Vs21);S21=buf[0]; 284 _mm256_storeu_ps(buf,Vs31);S31=buf[0]; 285 _mm256_storeu_ps(buf,Vs22);S22=buf[0]; 286 _mm256_storeu_ps(buf,Vs32);S32=buf[0]; 287 _mm256_storeu_ps(buf,Vs33);S33=buf[0]; 288 std::cout<<"Vector S ="<<std::endl; 289 std::cout<<std::setw(12)<<S11<<std::endl; 290 std::cout<<std::setw(12)<<S21<<" "<<std::setw(12)<<S22<<std::endl; 291 std::cout<<std::setw(12)<<S31<<" "<<std::setw(12)<<S32<<" "<<std::setw(12)<<S33<<std::endl; 292 #endif 293 #endif 294 295 } // End block : Symmetric eigenanalysis 296 297 //########################################################### 298 // Normalize quaternion for matrix V 299 //########################################################### 300 301 #if !defined(USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION) || defined(PERFORM_STRICT_QUATERNION_RENORMALIZATION) 302 303 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvs.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvs,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvs,Vqvs);) 304 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);) 305 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);) 306 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vqvvy);) 
307 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);) 308 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvz.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvz,Vqvvz);) 309 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);) 310 311 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=rsqrt(Stmp2.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_rsqrt_ps(Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_rsqrt_ps(Vtmp2);) 312 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp1.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp1,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp1,Vone_half);) 313 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp4);) 314 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp3);) 315 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp2.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp2,Vtmp3);) 316 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);) 317 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_sub_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_sub_ps(Vtmp1,Vtmp3);) 318 319 ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp1);) 320 
ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp1);) 321 ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp1);) 322 ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp1);) 323 324 #ifdef PRINT_DEBUGGING_OUTPUT 325 #ifdef USE_SCALAR_IMPLEMENTATION 326 std::cout<<"Scalar qV ="<<std::endl; 327 std::cout<<std::setw(12)<<Sqvs.f<<" "<<std::setw(12)<<Sqvvx.f<<" "<<std::setw(12)<<Sqvvy.f<<" "<<std::setw(12)<<Sqvvz.f<<std::endl; 328 #endif 329 #ifdef USE_SSE_IMPLEMENTATION 330 _mm_storeu_ps(buf,Vqvs);QVS=buf[0]; 331 _mm_storeu_ps(buf,Vqvvx);QVVX=buf[0]; 332 _mm_storeu_ps(buf,Vqvvy);QVVY=buf[0]; 333 _mm_storeu_ps(buf,Vqvvz);QVVZ=buf[0]; 334 std::cout<<"Vector qV ="<<std::endl; 335 std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl; 336 #endif 337 #ifdef USE_AVX_IMPLEMENTATION 338 _mm256_storeu_ps(buf,Vqvs);QVS=buf[0]; 339 _mm256_storeu_ps(buf,Vqvvx);QVVX=buf[0]; 340 _mm256_storeu_ps(buf,Vqvvy);QVVY=buf[0]; 341 _mm256_storeu_ps(buf,Vqvvz);QVVZ=buf[0]; 342 std::cout<<"Vector qV ="<<std::endl; 343 std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl; 344 #endif 345 #endif 346 347 #endif 348 349 { // Begin block : Conjugation with V 350 351 //########################################################### 352 // Transform quaternion to matrix V 353 //########################################################### 354 355 #ifndef COMPUTE_V_AS_MATRIX 356 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv11;) ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv11;) 357 ENABLE_SCALAR_IMPLEMENTATION(union 
{float f;unsigned int ui;} Sv21;) ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv21;) 358 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv31;) ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv31;) 359 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv12;) ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv12;) 360 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv22;) ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv22;) 361 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv32;) ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv32;) 362 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv13;) ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv13;) 363 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv23;) ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv23;) 364 ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv33;) ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv33;) 365 #endif 366 367 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);) 368 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvy.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvvy,Vqvvy);) 369 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvz.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vqvvz,Vqvvz);) 370 ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sqvs.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_mul_ps(Vqvs,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_mul_ps(Vqvs,Vqvs);) 371 ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv11.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_sub_ps(Vv11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_sub_ps(Vv11,Vtmp1);) 372 ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv22.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_sub_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_sub_ps(Vv22,Vtmp2);) 373 
ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv33.f+Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_add_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_add_ps(Vv33,Vtmp3);) 374 ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_add_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_add_ps(Vv22,Vtmp2);) 375 ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_sub_ps(Vv22,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_sub_ps(Vv22,Vtmp3);) 376 ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_add_ps(Vv11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_add_ps(Vv11,Vtmp1);) 377 ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_sub_ps(Vv11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_sub_ps(Vv11,Vtmp2);) 378 ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_sub_ps(Vv11,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_sub_ps(Vv11,Vtmp3);) 379 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f+Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vqvvx,Vqvvx);) 380 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvy.f+Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vqvvy,Vqvvy);) 381 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvz.f+Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vqvvz,Vqvvz);) 382 ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Sqvs.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_mul_ps(Vqvs,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_mul_ps(Vqvs,Vtmp1);) 383 ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Sqvs.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_mul_ps(Vqvs,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_mul_ps(Vqvs,Vtmp2);) 384 ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Sqvs.f*Stmp3.f;) 
ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_mul_ps(Vqvs,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_mul_ps(Vqvs,Vtmp3);) 385 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vtmp1);) 386 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvz.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvvz,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvvz,Vtmp2);) 387 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvx.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vqvvx,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vqvvx,Vtmp3);) 388 ENABLE_SCALAR_IMPLEMENTATION(Sv12.f=Stmp1.f-Sv21.f;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_sub_ps(Vtmp1,Vv21);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_sub_ps(Vtmp1,Vv21);) 389 ENABLE_SCALAR_IMPLEMENTATION(Sv23.f=Stmp2.f-Sv32.f;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_sub_ps(Vtmp2,Vv32);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_sub_ps(Vtmp2,Vv32);) 390 ENABLE_SCALAR_IMPLEMENTATION(Sv31.f=Stmp3.f-Sv13.f;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_sub_ps(Vtmp3,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_sub_ps(Vtmp3,Vv13);) 391 ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Stmp1.f+Sv21.f;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_add_ps(Vtmp1,Vv21);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_add_ps(Vtmp1,Vv21);) 392 ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Stmp2.f+Sv32.f;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_add_ps(Vtmp2,Vv32);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_add_ps(Vtmp2,Vv32);) 393 ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Stmp3.f+Sv13.f;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_add_ps(Vtmp3,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_add_ps(Vtmp3,Vv13);) 394 395 #ifdef COMPUTE_V_AS_MATRIX 396 #ifdef PRINT_DEBUGGING_OUTPUT 397 #ifdef USE_SCALAR_IMPLEMENTATION 398 std::cout<<"Scalar V ="<<std::endl; 399 std::cout<<std::setw(12)<<Sv11.f<<" "<<std::setw(12)<<Sv12.f<<" "<<std::setw(12)<<Sv13.f<<std::endl; 400 std::cout<<std::setw(12)<<Sv21.f<<" "<<std::setw(12)<<Sv22.f<<" 
"<<std::setw(12)<<Sv23.f<<std::endl; 401 std::cout<<std::setw(12)<<Sv31.f<<" "<<std::setw(12)<<Sv32.f<<" "<<std::setw(12)<<Sv33.f<<std::endl; 402 #endif 403 #ifdef USE_SSE_IMPLEMENTATION 404 _mm_storeu_ps(buf,Vv11);V11=buf[0]; 405 _mm_storeu_ps(buf,Vv21);V21=buf[0]; 406 _mm_storeu_ps(buf,Vv31);V31=buf[0]; 407 _mm_storeu_ps(buf,Vv12);V12=buf[0]; 408 _mm_storeu_ps(buf,Vv22);V22=buf[0]; 409 _mm_storeu_ps(buf,Vv32);V32=buf[0]; 410 _mm_storeu_ps(buf,Vv13);V13=buf[0]; 411 _mm_storeu_ps(buf,Vv23);V23=buf[0]; 412 _mm_storeu_ps(buf,Vv33);V33=buf[0]; 413 std::cout<<"Vector V ="<<std::endl; 414 std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl; 415 std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl; 416 std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl; 417 #endif 418 #ifdef USE_AVX_IMPLEMENTATION 419 _mm256_storeu_ps(buf,Vv11);V11=buf[0]; 420 _mm256_storeu_ps(buf,Vv21);V21=buf[0]; 421 _mm256_storeu_ps(buf,Vv31);V31=buf[0]; 422 _mm256_storeu_ps(buf,Vv12);V12=buf[0]; 423 _mm256_storeu_ps(buf,Vv22);V22=buf[0]; 424 _mm256_storeu_ps(buf,Vv32);V32=buf[0]; 425 _mm256_storeu_ps(buf,Vv13);V13=buf[0]; 426 _mm256_storeu_ps(buf,Vv23);V23=buf[0]; 427 _mm256_storeu_ps(buf,Vv33);V33=buf[0]; 428 std::cout<<"Vector V ="<<std::endl; 429 std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl; 430 std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl; 431 std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl; 432 #endif 433 #endif 434 #endif 435 436 //########################################################### 437 // Multiply (from the right) with V 438 //########################################################### 439 440 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va12;) ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va12;) 441 
ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa13.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va13;) ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va13;) 442 ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sv12.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_mul_ps(Vv12,Va11);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_mul_ps(Vv12,Va11);) 443 ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sv13.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_mul_ps(Vv13,Va11);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_mul_ps(Vv13,Va11);) 444 ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sv11.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_mul_ps(Vv11,Va11);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_mul_ps(Vv11,Va11);) 445 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);) 446 ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_add_ps(Va11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_add_ps(Va11,Vtmp1);) 447 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);) 448 ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_add_ps(Va11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_add_ps(Va11,Vtmp1);) 449 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);) 450 ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_add_ps(Va12,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_add_ps(Va12,Vtmp1);) 451 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);) 452 ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_add_ps(Va12,Vtmp1);) 
// --- Conjugation with V (tail): A <- A*V -------------------------------------------------------
// Each statement is emitted three times, once per backend: scalar (S* unions of float/uint bits),
// SSE (__m128 V*), and AVX (__m256 V*); the ENABLE_*_IMPLEMENTATION macros select which survive.
// Pattern per row r: old a_r2/a_r3 are saved in tmp2/tmp3, then
//   a_r1 = v11*a_r1 + v21*old_a_r2 + v31*old_a_r3,  a_r2 = v12*a_r1_old + v22*... , etc.
// i.e. row r of A is replaced by (row r of A) * V.  Rows 2 and 3 are handled below; row 1 and the
// first terms of this sequence precede this excerpt.
// NOTE(review): the bare integer tokens (453, 454, ...) interleaved in this text appear to be the
// original listing's line numbers fused in by extraction, not code — verify against the original
// source file before compiling.
ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_add_ps(Va12,Vtmp1);) 453 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);) 454 ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_add_ps(Va13,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_add_ps(Va13,Vtmp1);) 455 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);) 456 ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_add_ps(Va13,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_add_ps(Va13,Vtmp1);) 457 458 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va22;) ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va22;) 459 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va23;) ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va23;) 460 ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sv12.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_mul_ps(Vv12,Va21);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_mul_ps(Vv12,Va21);) 461 ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sv13.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_mul_ps(Vv13,Va21);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_mul_ps(Vv13,Va21);) 462 ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sv11.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_mul_ps(Vv11,Va21);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_mul_ps(Vv11,Va21);) 463 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);) 464 ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_add_ps(Va21,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_add_ps(Va21,Vtmp1);) 465 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);)
// Row 2 continues: accumulate the v31/v22/v32/v23/v33 terms into a21/a22/a23, then the same
// save-and-accumulate pattern starts for row 3 (tmp2/tmp3 <- old a32/a33; a32/a33/a31 rebuilt).
ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);) 466 ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_add_ps(Va21,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_add_ps(Va21,Vtmp1);) 467 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);) 468 ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_add_ps(Va22,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_add_ps(Va22,Vtmp1);) 469 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);) 470 ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_add_ps(Va22,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_add_ps(Va22,Vtmp1);) 471 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);) 472 ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_add_ps(Va23,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_add_ps(Va23,Vtmp1);) 473 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);) 474 ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_add_ps(Va23,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_add_ps(Va23,Vtmp1);) 475 476 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va32;) ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va32;) 477 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa33.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va33;) ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va33;) 478 ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sv12.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_mul_ps(Vv12,Va31);)
// Row 3: a31/a32/a33 <- (old row 3 of A) * V, same accumulate pattern as rows 1 and 2.
ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_mul_ps(Vv12,Va31);) 479 ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sv13.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_mul_ps(Vv13,Va31);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_mul_ps(Vv13,Va31);) 480 ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sv11.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_mul_ps(Vv11,Va31);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_mul_ps(Vv11,Va31);) 481 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);) 482 ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_add_ps(Va31,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_add_ps(Va31,Vtmp1);) 483 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);) 484 ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_add_ps(Va31,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_add_ps(Va31,Vtmp1);) 485 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);) 486 ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_add_ps(Va32,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_add_ps(Va32,Vtmp1);) 487 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);) 488 ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_add_ps(Va32,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_add_ps(Va32,Vtmp1);) 489 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);) 490
// --- Finish A <- A*V, optional debug dump, then squared column norms of A ----------------------
// Last terms of a33, then (under PRINT_DEBUGGING_OUTPUT) the per-backend dump of A "after
// multiplying with V": scalar prints the S* unions directly; SSE/AVX store lane 0 of each
// register into `buf` and print that single lane.
// NOTE(review): the bare integer tokens interleaved here look like listing line numbers fused in
// by extraction — verify against the original source before compiling.
ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_add_ps(Va33,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_add_ps(Va33,Vtmp1);) 491 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);) 492 ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_add_ps(Va33,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_add_ps(Va33,Vtmp1);) 493 494 #ifdef PRINT_DEBUGGING_OUTPUT 495 #ifdef USE_SCALAR_IMPLEMENTATION 496 std::cout<<"Scalar A (after multiplying with V) ="<<std::endl; 497 std::cout<<std::setw(12)<<Sa11.f<<" "<<std::setw(12)<<Sa12.f<<" "<<std::setw(12)<<Sa13.f<<std::endl; 498 std::cout<<std::setw(12)<<Sa21.f<<" "<<std::setw(12)<<Sa22.f<<" "<<std::setw(12)<<Sa23.f<<std::endl; 499 std::cout<<std::setw(12)<<Sa31.f<<" "<<std::setw(12)<<Sa32.f<<" "<<std::setw(12)<<Sa33.f<<std::endl; 500 #endif 501 #ifdef USE_SSE_IMPLEMENTATION 502 _mm_storeu_ps(buf,Va11);A11=buf[0]; 503 _mm_storeu_ps(buf,Va21);A21=buf[0]; 504 _mm_storeu_ps(buf,Va31);A31=buf[0]; 505 _mm_storeu_ps(buf,Va12);A12=buf[0]; 506 _mm_storeu_ps(buf,Va22);A22=buf[0]; 507 _mm_storeu_ps(buf,Va32);A32=buf[0]; 508 _mm_storeu_ps(buf,Va13);A13=buf[0]; 509 _mm_storeu_ps(buf,Va23);A23=buf[0]; 510 _mm_storeu_ps(buf,Va33);A33=buf[0]; 511 std::cout<<"Vector A (after multiplying with V) ="<<std::endl; 512 std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl; 513 std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl; 514 std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl; 515 #endif 516 #ifdef USE_AVX_IMPLEMENTATION 517 _mm256_storeu_ps(buf,Va11);A11=buf[0]; 518 _mm256_storeu_ps(buf,Va21);A21=buf[0]; 519 _mm256_storeu_ps(buf,Va31);A31=buf[0]; 520 _mm256_storeu_ps(buf,Va12);A12=buf[0]; 521 _mm256_storeu_ps(buf,Va22);A22=buf[0]; 522
// AVX debug dump continues; then the "Conjugation with V" and "Scope of qV" blocks close, and the
// column-sorting phase begins: tmp1 = |col1|^2 = a11^2 + a21^2 + a31^2, tmp2 = |col2|^2 (started
// at the end of this line, finished on the next).
_mm256_storeu_ps(buf,Va32);A32=buf[0]; 523 _mm256_storeu_ps(buf,Va13);A13=buf[0]; 524 _mm256_storeu_ps(buf,Va23);A23=buf[0]; 525 _mm256_storeu_ps(buf,Va33);A33=buf[0]; 526 std::cout<<"Vector A (after multiplying with V) ="<<std::endl; 527 std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl; 528 std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl; 529 std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl; 530 #endif 531 #endif 532 533 } // End block : Conjugation with V 534 535 } // End block : Scope of qV (if not maintained) 536 537 //########################################################### 538 // Permute columns such that the singular values are sorted 539 //########################################################### 540 541 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa11.f*Sa11.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va11,Va11);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va11,Va11);) 542 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa21.f*Sa21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va21,Va21);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va21,Va21);) 543 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);) 544 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa31.f*Sa31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va31,Va31);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va31,Va31);) 545 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);) 546 547 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa12.f*Sa12.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Va12,Va12);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Va12,Va12);) 548 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa22.f*Sa22.f;)
// tmp2 = |col2|^2 and tmp3 = |col3|^2 completed; these squared norms are the (unsorted) squared
// singular values.  Then the branch-free swap of columns 1-2 begins: tmp4 becomes an all-ones
// mask iff tmp1 < tmp2 (scalar: ternary to 0xffffffff; SSE: cmplt; AVX: _mm256_cmp_ps with
// _CMP_LT_OS — the commented-out _mm256_cmplt_ps alternative does not exist in AVX).
ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va22,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va22,Va22);) 549 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp2,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp2,Vtmp4);) 550 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa32.f*Sa32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va32,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va32,Va32);) 551 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp2,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp2,Vtmp4);) 552 553 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa13.f*Sa13.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Va13,Va13);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Va13,Va13);) 554 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa23.f*Sa23.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va23,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va23,Va23);) 555 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp3.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp3,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp3,Vtmp4);) 556 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa33.f*Sa33.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va33,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va33,Va33);) 557 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp3.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp3,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp3,Vtmp4);) 558 559 // Swap columns 1-2 if necessary 560 561 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp1.f<Stmp2.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp1,Vtmp2, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp1,Vtmp2);) 562 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa11.ui^Sa12.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va11,Va12);)
// --- Conditional swap of columns 1-2 of A (branch-free) ----------------------------------------
// Masked xor-swap idiom, repeated per element pair (x,y): tmp5 = x^y; tmp5 &= mask(tmp4);
// x ^= tmp5; y ^= tmp5.  With mask all-ones this exchanges x and y; with mask zero it is a no-op.
// Applied here to (a11,a12), (a21,a22), (a31,a32).
// NOTE(review): the bare integer tokens interleaved here look like listing line numbers fused in
// by extraction — verify against the original source before compiling.
ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va11,Va12);) 563 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 564 ENABLE_SCALAR_IMPLEMENTATION(Sa11.ui=Sa11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_xor_ps(Va11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_xor_ps(Va11,Vtmp5);) 565 ENABLE_SCALAR_IMPLEMENTATION(Sa12.ui=Sa12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_xor_ps(Va12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_xor_ps(Va12,Vtmp5);) 566 567 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa21.ui^Sa22.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va21,Va22);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va21,Va22);) 568 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 569 ENABLE_SCALAR_IMPLEMENTATION(Sa21.ui=Sa21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_xor_ps(Va21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_xor_ps(Va21,Vtmp5);) 570 ENABLE_SCALAR_IMPLEMENTATION(Sa22.ui=Sa22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_xor_ps(Va22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_xor_ps(Va22,Vtmp5);) 571 572 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa31.ui^Sa32.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va31,Va32);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va31,Va32);) 573 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 574 ENABLE_SCALAR_IMPLEMENTATION(Sa31.ui=Sa31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_xor_ps(Va31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_xor_ps(Va31,Vtmp5);) 575 ENABLE_SCALAR_IMPLEMENTATION(Sa32.ui=Sa32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_xor_ps(Va32,Vtmp5);)
// Same masked xor-swap applied to columns 1-2 of V when V is kept as an explicit matrix.
ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_xor_ps(Va32,Vtmp5);) 576 577 #ifdef COMPUTE_V_AS_MATRIX 578 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv11.ui^Sv12.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv11,Vv12);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv11,Vv12);) 579 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 580 ENABLE_SCALAR_IMPLEMENTATION(Sv11.ui=Sv11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_xor_ps(Vv11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_xor_ps(Vv11,Vtmp5);) 581 ENABLE_SCALAR_IMPLEMENTATION(Sv12.ui=Sv12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_xor_ps(Vv12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_xor_ps(Vv12,Vtmp5);) 582 583 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv21.ui^Sv22.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv21,Vv22);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv21,Vv22);) 584 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 585 ENABLE_SCALAR_IMPLEMENTATION(Sv21.ui=Sv21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_xor_ps(Vv21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_xor_ps(Vv21,Vtmp5);) 586 ENABLE_SCALAR_IMPLEMENTATION(Sv22.ui=Sv22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_xor_ps(Vv22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_xor_ps(Vv22,Vtmp5);) 587 588 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv31.ui^Sv32.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv31,Vv32);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv31,Vv32);) 589 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 590 ENABLE_SCALAR_IMPLEMENTATION(Sv31.ui=Sv31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_xor_ps(Vv31,Vtmp5);)
// Also swap the squared norms tmp1/tmp2 under the same mask so the bookkeeping stays consistent,
// then build the sign factor: tmp5 = bits(-2.f) & mask, tmp4 = 1 + tmp5, i.e. tmp4 = -1 if the
// columns were swapped and +1 otherwise.
ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_xor_ps(Vv31,Vtmp5);) 591 ENABLE_SCALAR_IMPLEMENTATION(Sv32.ui=Sv32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_xor_ps(Vv32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_xor_ps(Vv32,Vtmp5);) 592 #endif 593 594 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp1.ui^Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp1,Vtmp2);) 595 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 596 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=Stmp1.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_xor_ps(Vtmp1,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_xor_ps(Vtmp1,Vtmp5);) 597 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Stmp2.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_xor_ps(Vtmp2,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_xor_ps(Vtmp2,Vtmp5);) 598 599 // If columns 1-2 have been swapped, negate 2nd column of A and V so that V is still a rotation 600 601 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);) 602 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 603 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;) ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;) 604 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);) 605 606 ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_mul_ps(Va12,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_mul_ps(Va12,Vtmp4);) 607 ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f*Stmp4.f;)
// Scale the 2nd column of A (and of V, in matrix form) by the ±1 factor tmp4, so the permuted V
// keeps determinant +1.
ENABLE_SSE_IMPLEMENTATION(Va22=_mm_mul_ps(Va22,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_mul_ps(Va22,Vtmp4);) 608 ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_mul_ps(Va32,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_mul_ps(Va32,Vtmp4);) 609 610 #ifdef COMPUTE_V_AS_MATRIX 611 ENABLE_SCALAR_IMPLEMENTATION(Sv12.f=Sv12.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_mul_ps(Vv12,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_mul_ps(Vv12,Vtmp4);) 612 ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_mul_ps(Vv22,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_mul_ps(Vv22,Vtmp4);) 613 ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Sv32.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_mul_ps(Vv32,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_mul_ps(Vv32,Vtmp4);) 614 #endif 615 616 // If columns 1-2 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this) 617 618 #ifdef COMPUTE_V_AS_QUATERNION 619 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);) 620 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);) 621 622 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvz);) 623 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);) 624 ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);) 625 ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f-Sqvs.f;)
// Quaternion form of the same column permutation: tmp4 is remapped via tmp4*0.5-0.5 (no swap:
// 1 -> 0; swap: -1 -> -1), then (qvs, qvvz) and (qvvx, qvvy) pairs are recombined using that
// factor.  NOTE(review): this presumably composes qV with the rotation matching the 1-2 column
// permutation — confirm against the accompanying SVD technical report.
// End of the 1-2 swap; the "Swap columns 1-3 if necessary" phase then begins with a new mask
// tmp4 = (tmp1 < tmp3) and the xor-swap of a11/a13.
ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_sub_ps(Vqvvz,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_sub_ps(Vqvvz,Vqvs);) 626 ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;) 627 628 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvx);) 629 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvy);) 630 ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp4);) 631 ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f-Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_sub_ps(Vqvvx,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_sub_ps(Vqvvx,Vqvvy);) 632 ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvvy=Vtmp5;) 633 #endif 634 635 // Swap columns 1-3 if necessary 636 637 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp1.f<Stmp3.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp1,Vtmp3, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp1,Vtmp3);) 638 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa11.ui^Sa13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va11,Va13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va11,Va13);) 639 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 640 ENABLE_SCALAR_IMPLEMENTATION(Sa11.ui=Sa11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_xor_ps(Va11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_xor_ps(Va11,Vtmp5);) 641
// --- Conditional swap of columns 1-3 of A (branch-free), continued -----------------------------
// Same masked xor-swap idiom as the 1-2 case, under the mask tmp4 = (tmp1 < tmp3) set just above:
// exchange (a11,a13), (a21,a23), (a31,a33) only when the mask is all-ones.
// NOTE(review): the bare integer tokens interleaved here look like listing line numbers fused in
// by extraction — verify against the original source before compiling.
ENABLE_SCALAR_IMPLEMENTATION(Sa13.ui=Sa13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_xor_ps(Va13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_xor_ps(Va13,Vtmp5);) 642 643 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa21.ui^Sa23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va21,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va21,Va23);) 644 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 645 ENABLE_SCALAR_IMPLEMENTATION(Sa21.ui=Sa21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_xor_ps(Va21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_xor_ps(Va21,Vtmp5);) 646 ENABLE_SCALAR_IMPLEMENTATION(Sa23.ui=Sa23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_xor_ps(Va23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_xor_ps(Va23,Vtmp5);) 647 648 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa31.ui^Sa33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va31,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va31,Va33);) 649 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 650 ENABLE_SCALAR_IMPLEMENTATION(Sa31.ui=Sa31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_xor_ps(Va31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_xor_ps(Va31,Vtmp5);) 651 ENABLE_SCALAR_IMPLEMENTATION(Sa33.ui=Sa33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_xor_ps(Va33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_xor_ps(Va33,Vtmp5);) 652 653 #ifdef COMPUTE_V_AS_MATRIX 654 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv11.ui^Sv13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv11,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv11,Vv13);) 655 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 656
// Columns 1-3 of the explicit V matrix get the same conditional exchange; afterwards the squared
// norms tmp1/tmp3 are swapped under the same mask to keep the ordering bookkeeping consistent.
ENABLE_SCALAR_IMPLEMENTATION(Sv11.ui=Sv11.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_xor_ps(Vv11,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_xor_ps(Vv11,Vtmp5);) 657 ENABLE_SCALAR_IMPLEMENTATION(Sv13.ui=Sv13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_xor_ps(Vv13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_xor_ps(Vv13,Vtmp5);) 658 659 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv21.ui^Sv23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv21,Vv23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv21,Vv23);) 660 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 661 ENABLE_SCALAR_IMPLEMENTATION(Sv21.ui=Sv21.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_xor_ps(Vv21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_xor_ps(Vv21,Vtmp5);) 662 ENABLE_SCALAR_IMPLEMENTATION(Sv23.ui=Sv23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_xor_ps(Vv23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_xor_ps(Vv23,Vtmp5);) 663 664 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv31.ui^Sv33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv31,Vv33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv31,Vv33);) 665 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 666 ENABLE_SCALAR_IMPLEMENTATION(Sv31.ui=Sv31.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_xor_ps(Vv31,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_xor_ps(Vv31,Vtmp5);) 667 ENABLE_SCALAR_IMPLEMENTATION(Sv33.ui=Sv33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_xor_ps(Vv33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_xor_ps(Vv33,Vtmp5);) 668 #endif 669 670 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp1.ui^Stmp3.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp1,Vtmp3);) 671
// Sign fix: tmp4 = 1 + (bits(-2.f) & mask) gives -1 when swapped, +1 otherwise; the 1st column of
// A (and of V, in matrix form) is scaled by it so V remains a proper rotation (det +1).
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 672 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=Stmp1.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_xor_ps(Vtmp1,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_xor_ps(Vtmp1,Vtmp5);) 673 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.ui=Stmp3.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_xor_ps(Vtmp3,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_xor_ps(Vtmp3,Vtmp5);) 674 675 // If columns 1-3 have been swapped, negate 1st column of A and V so that V is still a rotation 676 677 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);) 678 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 679 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;) ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;) 680 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);) 681 682 ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va11=_mm_mul_ps(Va11,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_mul_ps(Va11,Vtmp4);) 683 ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va21=_mm_mul_ps(Va21,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_mul_ps(Va21,Vtmp4);) 684 ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va31=_mm_mul_ps(Va31,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_mul_ps(Va31,Vtmp4);) 685 686 #ifdef COMPUTE_V_AS_MATRIX 687 ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_mul_ps(Vv11,Vtmp4);)
// Finish scaling V's 1st column, then (under COMPUTE_V_AS_QUATERNION) apply the corresponding
// quaternion update: tmp4 remapped via tmp4*0.5-0.5, the (qvs, qvvy) pair recombined first.
// NOTE(review): presumably this composes qV with the rotation matching the 1-3 permutation —
// confirm against the accompanying SVD technical report.
ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_mul_ps(Vv11,Vtmp4);) 688 ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Sv21.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_mul_ps(Vv21,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_mul_ps(Vv21,Vtmp4);) 689 ENABLE_SCALAR_IMPLEMENTATION(Sv31.f=Sv31.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_mul_ps(Vv31,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_mul_ps(Vv31,Vtmp4);) 690 #endif 691 692 // If columns 1-3 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this) 693 694 #ifdef COMPUTE_V_AS_QUATERNION 695 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);) 696 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);) 697 698 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvy);) 699 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);) 700 ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);) 701 ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f-Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_sub_ps(Vqvvy,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_sub_ps(Vqvvy,Vqvs);) 702 ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;) 703 704 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvz);) 705
ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvx);) 706 ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp4);) 707 ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f-Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_sub_ps(Vqvvz,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_sub_ps(Vqvvz,Vqvvx);) 708 ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvvx=Vtmp5;) 709 #endif 710 711 // Swap columns 2-3 if necessary 712 713 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp2.f<Stmp3.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp2,Vtmp3, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp2,Vtmp3);) 714 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa12.ui^Sa13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va12,Va13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va12,Va13);) 715 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 716 ENABLE_SCALAR_IMPLEMENTATION(Sa12.ui=Sa12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va12=_mm_xor_ps(Va12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_xor_ps(Va12,Vtmp5);) 717 ENABLE_SCALAR_IMPLEMENTATION(Sa13.ui=Sa13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_xor_ps(Va13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_xor_ps(Va13,Vtmp5);) 718 719 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa22.ui^Sa23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va22,Va23);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va22,Va23);) 720 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) 
ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 721 ENABLE_SCALAR_IMPLEMENTATION(Sa22.ui=Sa22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va22=_mm_xor_ps(Va22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_xor_ps(Va22,Vtmp5);) 722 ENABLE_SCALAR_IMPLEMENTATION(Sa23.ui=Sa23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_xor_ps(Va23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_xor_ps(Va23,Vtmp5);) 723 724 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa32.ui^Sa33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va32,Va33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va32,Va33);) 725 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 726 ENABLE_SCALAR_IMPLEMENTATION(Sa32.ui=Sa32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va32=_mm_xor_ps(Va32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_xor_ps(Va32,Vtmp5);) 727 ENABLE_SCALAR_IMPLEMENTATION(Sa33.ui=Sa33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_xor_ps(Va33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_xor_ps(Va33,Vtmp5);) 728 729 #ifdef COMPUTE_V_AS_MATRIX 730 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv12.ui^Sv13.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv12,Vv13);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv12,Vv13);) 731 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 732 ENABLE_SCALAR_IMPLEMENTATION(Sv12.ui=Sv12.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_xor_ps(Vv12,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_xor_ps(Vv12,Vtmp5);) 733 ENABLE_SCALAR_IMPLEMENTATION(Sv13.ui=Sv13.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_xor_ps(Vv13,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_xor_ps(Vv13,Vtmp5);) 734 735 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv22.ui^Sv23.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv22,Vv23);) 
ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv22,Vv23);) 736 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 737 ENABLE_SCALAR_IMPLEMENTATION(Sv22.ui=Sv22.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_xor_ps(Vv22,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_xor_ps(Vv22,Vtmp5);) 738 ENABLE_SCALAR_IMPLEMENTATION(Sv23.ui=Sv23.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_xor_ps(Vv23,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_xor_ps(Vv23,Vtmp5);) 739 740 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv32.ui^Sv33.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv32,Vv33);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv32,Vv33);) 741 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 742 ENABLE_SCALAR_IMPLEMENTATION(Sv32.ui=Sv32.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_xor_ps(Vv32,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_xor_ps(Vv32,Vtmp5);) 743 ENABLE_SCALAR_IMPLEMENTATION(Sv33.ui=Sv33.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_xor_ps(Vv33,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_xor_ps(Vv33,Vtmp5);) 744 #endif 745 746 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp2.ui^Stmp3.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp2,Vtmp3);) 747 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 748 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Stmp2.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_xor_ps(Vtmp2,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_xor_ps(Vtmp2,Vtmp5);) 749 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.ui=Stmp3.ui^Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_xor_ps(Vtmp3,Vtmp5);) 
ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_xor_ps(Vtmp3,Vtmp5);) 750 751 // If columns 2-3 have been swapped, negate 3rd column of A and V so that V is still a rotation 752 753 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);) 754 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);) 755 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;) ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;) 756 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);) 757 758 ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va13=_mm_mul_ps(Va13,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_mul_ps(Va13,Vtmp4);) 759 ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va23=_mm_mul_ps(Va23,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_mul_ps(Va23,Vtmp4);) 760 ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Va33=_mm_mul_ps(Va33,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_mul_ps(Va33,Vtmp4);) 761 762 #ifdef COMPUTE_V_AS_MATRIX 763 ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Sv13.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_mul_ps(Vv13,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_mul_ps(Vv13,Vtmp4);) 764 ENABLE_SCALAR_IMPLEMENTATION(Sv23.f=Sv23.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_mul_ps(Vv23,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_mul_ps(Vv23,Vtmp4);) 765 ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv33.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_mul_ps(Vv33,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_mul_ps(Vv33,Vtmp4);) 766 #endif 767 768 // If columns 2-3 have been swapped, also update quaternion representation of V (the quaternion may 
become un-normalized after this) 769 770 #ifdef COMPUTE_V_AS_QUATERNION 771 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);) 772 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);) 773 774 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvx);) 775 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);) 776 ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);) 777 ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f-Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_sub_ps(Vqvvx,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_sub_ps(Vqvvx,Vqvs);) 778 ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;) 779 780 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvy);) 781 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvz);) 782 ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp4);) 783 ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f-Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_sub_ps(Vqvvy,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_sub_ps(Vqvvy,Vqvvz);)
// Quaternion counterpart of the 2-3 column swap (COMPUTE_V_AS_QUATERNION): tmp4 is
// remapped from {+1,-1} to {0,-1} via tmp4*0.5 - 0.5, and the (qvs,qvvx) and
// (qvvz,qvvy) component pairs are blended with it. As noted in the comment above,
// qV may lose unit length here; it is renormalized further below. The write-back of
// the blended z component completes on the next line, followed by an optional
// PRINT_DEBUGGING_OUTPUT dump of V (scalar prints Sv**, vector paths print lane 0).
784 ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=Vtmp5;) ENABLE_AVX_IMPLEMENTATION(Vqvvz=Vtmp5;) 785 #endif 786 787 #ifdef COMPUTE_V_AS_MATRIX 788 #ifdef PRINT_DEBUGGING_OUTPUT 789 #ifdef USE_SCALAR_IMPLEMENTATION 790 std::cout<<"Scalar V ="<<std::endl; 791 std::cout<<std::setw(12)<<Sv11.f<<" "<<std::setw(12)<<Sv12.f<<" "<<std::setw(12)<<Sv13.f<<std::endl; 792 std::cout<<std::setw(12)<<Sv21.f<<" "<<std::setw(12)<<Sv22.f<<" "<<std::setw(12)<<Sv23.f<<std::endl; 793 std::cout<<std::setw(12)<<Sv31.f<<" "<<std::setw(12)<<Sv32.f<<" "<<std::setw(12)<<Sv33.f<<std::endl; 794 #endif 795 #ifdef USE_SSE_IMPLEMENTATION 796 _mm_storeu_ps(buf,Vv11);V11=buf[0]; 797 _mm_storeu_ps(buf,Vv21);V21=buf[0]; 798 _mm_storeu_ps(buf,Vv31);V31=buf[0]; 799 _mm_storeu_ps(buf,Vv12);V12=buf[0]; 800 _mm_storeu_ps(buf,Vv22);V22=buf[0]; 801 _mm_storeu_ps(buf,Vv32);V32=buf[0]; 802 _mm_storeu_ps(buf,Vv13);V13=buf[0]; 803 _mm_storeu_ps(buf,Vv23);V23=buf[0]; 804 _mm_storeu_ps(buf,Vv33);V33=buf[0]; 805 std::cout<<"Vector V ="<<std::endl; 806 std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl; 807 std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl; 808 std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl; 809 #endif 810 #ifdef USE_AVX_IMPLEMENTATION 811 _mm256_storeu_ps(buf,Vv11);V11=buf[0]; 812 _mm256_storeu_ps(buf,Vv21);V21=buf[0]; 813 _mm256_storeu_ps(buf,Vv31);V31=buf[0]; 814 _mm256_storeu_ps(buf,Vv12);V12=buf[0]; 815 _mm256_storeu_ps(buf,Vv22);V22=buf[0]; 816 _mm256_storeu_ps(buf,Vv32);V32=buf[0]; 817 _mm256_storeu_ps(buf,Vv13);V13=buf[0]; 818 _mm256_storeu_ps(buf,Vv23);V23=buf[0]; 819 _mm256_storeu_ps(buf,Vv33);V33=buf[0]; 820 std::cout<<"Vector V ="<<std::endl; 821 std::cout<<std::setw(12)<<V11<<" "<<std::setw(12)<<V12<<" "<<std::setw(12)<<V13<<std::endl; 822 std::cout<<std::setw(12)<<V21<<" "<<std::setw(12)<<V22<<" "<<std::setw(12)<<V23<<std::endl;
// Debug output (PRINT_DEBUGGING_OUTPUT): last row of V, then the working matrix A
// after the symmetric-eigenanalysis phase (label: "after multiplying with V") — one
// section per active implementation (scalar / SSE / AVX); vector paths store a
// register to buf and print only lane 0. This is followed by the start of the
// quaternion renormalization for V: accumulate |qV|^2 = qvs^2+qvvx^2+qvvy^2+qvvz^2
// into tmp2 and take a hardware reciprocal-square-root estimate into tmp1
// (rsqrt / _mm_rsqrt_ps / _mm256_rsqrt_ps).
// NOTE(review): the stray numbers (823, 824, ...) and the line break inside the
// string literal at the end of this span are extraction artifacts; code bytes are
// preserved unmodified.
823 std::cout<<std::setw(12)<<V31<<" "<<std::setw(12)<<V32<<" "<<std::setw(12)<<V33<<std::endl; 824 #endif 825 #endif 826 #endif 827 828 #ifdef PRINT_DEBUGGING_OUTPUT 829 #ifdef USE_SCALAR_IMPLEMENTATION 830 std::cout<<"Scalar A (after multiplying with V) ="<<std::endl; 831 std::cout<<std::setw(12)<<Sa11.f<<" "<<std::setw(12)<<Sa12.f<<" "<<std::setw(12)<<Sa13.f<<std::endl; 832 std::cout<<std::setw(12)<<Sa21.f<<" "<<std::setw(12)<<Sa22.f<<" "<<std::setw(12)<<Sa23.f<<std::endl; 833 std::cout<<std::setw(12)<<Sa31.f<<" "<<std::setw(12)<<Sa32.f<<" "<<std::setw(12)<<Sa33.f<<std::endl; 834 #endif 835 #ifdef USE_SSE_IMPLEMENTATION 836 _mm_storeu_ps(buf,Va11);A11=buf[0]; 837 _mm_storeu_ps(buf,Va21);A21=buf[0]; 838 _mm_storeu_ps(buf,Va31);A31=buf[0]; 839 _mm_storeu_ps(buf,Va12);A12=buf[0]; 840 _mm_storeu_ps(buf,Va22);A22=buf[0]; 841 _mm_storeu_ps(buf,Va32);A32=buf[0]; 842 _mm_storeu_ps(buf,Va13);A13=buf[0]; 843 _mm_storeu_ps(buf,Va23);A23=buf[0]; 844 _mm_storeu_ps(buf,Va33);A33=buf[0]; 845 std::cout<<"Vector A (after multiplying with V) ="<<std::endl; 846 std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl; 847 std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl; 848 std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl; 849 #endif 850 #ifdef USE_AVX_IMPLEMENTATION 851 _mm256_storeu_ps(buf,Va11);A11=buf[0]; 852 _mm256_storeu_ps(buf,Va21);A21=buf[0]; 853 _mm256_storeu_ps(buf,Va31);A31=buf[0]; 854 _mm256_storeu_ps(buf,Va12);A12=buf[0]; 855 _mm256_storeu_ps(buf,Va22);A22=buf[0]; 856 _mm256_storeu_ps(buf,Va32);A32=buf[0]; 857 _mm256_storeu_ps(buf,Va13);A13=buf[0]; 858 _mm256_storeu_ps(buf,Va23);A23=buf[0]; 859 _mm256_storeu_ps(buf,Va33);A33=buf[0]; 860 std::cout<<"Vector A (after multiplying with V) ="<<std::endl; 861 std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl; 862 std::cout<<std::setw(12)<<A21<<" 
"<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl; 863 std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl; 864 #endif 865 #endif 866 867 //########################################################### 868 // Re-normalize quaternion for matrix V 869 //########################################################### 870 871 #ifdef COMPUTE_V_AS_QUATERNION 872 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvs.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvs,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvs,Vqvs);) 873 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);) 874 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);) 875 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vqvvy);) 876 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);) 877 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvz.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvz,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvz,Vqvvz);) 878 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);) 879 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=rsqrt(Stmp2.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_rsqrt_ps(Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_rsqrt_ps(Vtmp2);) 880 881 #ifdef PERFORM_STRICT_QUATERNION_RENORMALIZATION 882 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp1.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp1,Vone_half);)
// Strict renormalization path (PERFORM_STRICT_QUATERNION_RENORMALIZATION): one
// Newton-Raphson refinement of the rsqrt estimate x = tmp1 with s = tmp2 = |qV|^2:
//   tmp4 = x/2; tmp3 = s*x^3/2; x <- x + x/2 - s*x^3/2  (i.e. x <- 1.5x - 0.5*s*x^3).
// The refined 1/|qV| in tmp1 then scales all four quaternion components, followed by
// an optional debug dump of qV and the start of the QR phase (U initialized below).
ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp1,Vone_half);) 883 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp4);) 884 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp3);) 885 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp2.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp2,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp2,Vtmp3);) 886 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);) 887 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f-Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_sub_ps(Vtmp1,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_sub_ps(Vtmp1,Vtmp3);) 888 #endif 889 890 ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp1);) 891 ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp1);) 892 ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp1);) 893 ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp1);) 894 895 #ifdef PRINT_DEBUGGING_OUTPUT 896 #ifdef USE_SCALAR_IMPLEMENTATION 897 std::cout<<"Scalar qV ="<<std::endl; 898 std::cout<<std::setw(12)<<Sqvs.f<<" "<<std::setw(12)<<Sqvvx.f<<" "<<std::setw(12)<<Sqvvy.f<<" "<<std::setw(12)<<Sqvvz.f<<std::endl; 899 #endif 900 #ifdef USE_SSE_IMPLEMENTATION 901 
// qV debug dump continues (SSE then AVX lanes), then the Givens-QR phase begins:
// under COMPUTE_U_AS_MATRIX, U is initialized to the 3x3 identity (ones on the
// diagonal, registers zeroed via self-XOR elsewhere).
_mm_storeu_ps(buf,Vqvs);QVS=buf[0]; 902 _mm_storeu_ps(buf,Vqvvx);QVVX=buf[0]; 903 _mm_storeu_ps(buf,Vqvvy);QVVY=buf[0]; 904 _mm_storeu_ps(buf,Vqvvz);QVVZ=buf[0]; 905 std::cout<<"Vector qV ="<<std::endl; 906 std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl; 907 #endif 908 #ifdef USE_AVX_IMPLEMENTATION 909 _mm256_storeu_ps(buf,Vqvs);QVS=buf[0]; 910 _mm256_storeu_ps(buf,Vqvvx);QVVX=buf[0]; 911 _mm256_storeu_ps(buf,Vqvvy);QVVY=buf[0]; 912 _mm256_storeu_ps(buf,Vqvvz);QVVZ=buf[0]; 913 std::cout<<"Vector qV ="<<std::endl; 914 std::cout<<std::setw(12)<<QVS<<" "<<std::setw(12)<<QVVX<<" "<<std::setw(12)<<QVVY<<" "<<std::setw(12)<<QVVZ<<std::endl; 915 #endif 916 #endif 917 #endif 918 919 //########################################################### 920 // Construct QR factorization of A*V (=U*D) using Givens rotations 921 //########################################################### 922 923 #ifdef COMPUTE_U_AS_MATRIX 924 ENABLE_SCALAR_IMPLEMENTATION(Su11.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vu11=Vone;) ENABLE_AVX_IMPLEMENTATION(Vu11=Vone;) 925 ENABLE_SCALAR_IMPLEMENTATION(Su21.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu21=_mm_xor_ps(Vu21,Vu21);) ENABLE_AVX_IMPLEMENTATION(Vu21=_mm256_xor_ps(Vu21,Vu21);) 926 ENABLE_SCALAR_IMPLEMENTATION(Su31.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu31=_mm_xor_ps(Vu31,Vu31);) ENABLE_AVX_IMPLEMENTATION(Vu31=_mm256_xor_ps(Vu31,Vu31);) 927 ENABLE_SCALAR_IMPLEMENTATION(Su12.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu12=_mm_xor_ps(Vu12,Vu12);) ENABLE_AVX_IMPLEMENTATION(Vu12=_mm256_xor_ps(Vu12,Vu12);) 928 ENABLE_SCALAR_IMPLEMENTATION(Su22.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vu22=Vone;) ENABLE_AVX_IMPLEMENTATION(Vu22=Vone;) 929 ENABLE_SCALAR_IMPLEMENTATION(Su32.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu32=_mm_xor_ps(Vu32,Vu32);) ENABLE_AVX_IMPLEMENTATION(Vu32=_mm256_xor_ps(Vu32,Vu32);) 930 ENABLE_SCALAR_IMPLEMENTATION(Su13.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu13=_mm_xor_ps(Vu13,Vu13);)
// Finish initializing U to the identity (or qU to the identity quaternion under
// COMPUTE_U_AS_QUATERNION), then alias the generic Givens-QR kernel's names
// (SAPIVOT/SANPIVOT, SA**/SU**, VA**/VU**) onto the concrete registers for the
// FIRST Givens rotation (pivot a11, annihilating a21) and textually include the
// kernel; the matching #undef list follows, then qU absorbs the kernel's (ch, sh)
// pair as a rotation about the z axis (qus=ch, quvz=sh), and the aliases are
// rebound for the SECOND rotation (pivot a11, annihilating a31; columns 1 and 3 of U).
// CAUTION(review): the "#undef" token ending this line and the "SU32" beginning the
// next are ONE directive split by the extraction — nothing may be inserted between.
ENABLE_AVX_IMPLEMENTATION(Vu13=_mm256_xor_ps(Vu13,Vu13);) 931 ENABLE_SCALAR_IMPLEMENTATION(Su23.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vu23=_mm_xor_ps(Vu23,Vu23);) ENABLE_AVX_IMPLEMENTATION(Vu23=_mm256_xor_ps(Vu23,Vu23);) 932 ENABLE_SCALAR_IMPLEMENTATION(Su33.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vu33=Vone;) ENABLE_AVX_IMPLEMENTATION(Vu33=Vone;) 933 #endif 934 935 #ifdef COMPUTE_U_AS_QUATERNION 936 ENABLE_SCALAR_IMPLEMENTATION(Squs.f=1.;) ENABLE_SSE_IMPLEMENTATION(Vqus=Vone;) ENABLE_AVX_IMPLEMENTATION(Vqus=Vone;) 937 ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_xor_ps(Vquvx,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_xor_ps(Vquvx,Vquvx);) 938 ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_xor_ps(Vquvy,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_xor_ps(Vquvy,Vquvy);) 939 ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_xor_ps(Vquvz,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_xor_ps(Vquvz,Vquvz);) 940 #endif 941 942 // First Givens rotation 943 944 #define SAPIVOT Sa11 945 #define SANPIVOT Sa21 946 #define SA11 Sa11 947 #define SA21 Sa21 948 #define SA12 Sa12 949 #define SA22 Sa22 950 #define SA13 Sa13 951 #define SA23 Sa23 952 #define SU11 Su11 953 #define SU12 Su12 954 #define SU21 Su21 955 #define SU22 Su22 956 #define SU31 Su31 957 #define SU32 Su32 958 959 #define VAPIVOT Va11 960 #define VANPIVOT Va21 961 #define VA11 Va11 962 #define VA21 Va21 963 #define VA12 Va12 964 #define VA22 Va22 965 #define VA13 Va13 966 #define VA23 Va23 967 #define VU11 Vu11 968 #define VU12 Vu12 969 #define VU21 Vu21 970 #define VU22 Vu22 971 #define VU31 Vu31 972 #define VU32 Vu32 973 974 #include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp" 975 976 #undef SAPIVOT 977 #undef SANPIVOT 978 #undef SA11 979 #undef SA21 980 #undef SA12 981 #undef SA22 982 #undef SA13 983 #undef SA23 984 #undef SU11 985 #undef SU12 986 #undef SU21 987 #undef SU22 988 #undef SU31 989 #undef 
SU32 990 991 #undef VAPIVOT 992 #undef VANPIVOT 993 #undef VA11 994 #undef VA21 995 #undef VA12 996 #undef VA22 997 #undef VA13 998 #undef VA23 999 #undef VU11 1000 #undef VU12 1001 #undef VU21 1002 #undef VU22 1003 #undef VU31 1004 #undef VU32 1005 1006 // Update quaternion representation of U 1007 1008 #ifdef COMPUTE_U_AS_QUATERNION 1009 ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=Vch;) ENABLE_AVX_IMPLEMENTATION(Vqus=Vch;) 1010 ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_xor_ps(Vquvx,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_xor_ps(Vquvx,Vquvx);) 1011 ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=0.;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_xor_ps(Vquvy,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_xor_ps(Vquvy,Vquvy);) 1012 ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=Vsh;) ENABLE_AVX_IMPLEMENTATION(Vquvz=Vsh;) 1013 #endif 1014 1015 // Second Givens rotation 1016 1017 #define SAPIVOT Sa11 1018 #define SANPIVOT Sa31 1019 #define SA11 Sa11 1020 #define SA21 Sa31 1021 #define SA12 Sa12 1022 #define SA22 Sa32 1023 #define SA13 Sa13 1024 #define SA23 Sa33 1025 #define SU11 Su11 1026 #define SU12 Su13 1027 #define SU21 Su21 1028 #define SU22 Su23 1029 #define SU31 Su31 1030 #define SU32 Su33 1031 1032 #define VAPIVOT Va11 1033 #define VANPIVOT Va31 1034 #define VA11 Va11 1035 #define VA21 Va31 1036 #define VA12 Va12 1037 #define VA22 Va32 1038 #define VA13 Va13 1039 #define VA23 Va33 1040 #define VU11 Vu11 1041 #define VU12 Vu13 1042 #define VU21 Vu21 1043 #define VU22 Vu23 1044 #define VU31 Vu31 1045 #define VU32 Vu33 1046 1047 #include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp" 1048 1049 #undef SAPIVOT 1050 #undef SANPIVOT 1051 #undef SA11 1052 #undef SA21 1053 #undef SA12 1054 #undef SA22 1055 #undef SA13 1056 #undef SA23 1057 #undef SU11 1058 #undef SU12 1059 #undef SU21 1060 #undef SU22 1061 #undef SU31 1062 #undef SU32 1063 1064 #undef VAPIVOT 1065 
// Undo the second rotation's vector-side aliases, fold the kernel's (ch, sh) pair
// into qU (this one enters through the quvx/quvy components, combined with the
// existing quvz/qus state), then rebind the aliases for the THIRD Givens rotation
// (pivot a22, annihilating a32; rows 2-3, columns 2-3 of U) and include the kernel
// once more. The final qU update computes the full quaternion product with the
// third rotation: tmp1..tmp3 hold sh*quv{x,y,z} cross terms, sh is folded with qus,
// the existing components are scaled by ch, and the cross terms are added/subtracted
// per the quaternion multiplication rule.
// CAUTION(review): the "#undef" token ending this line and the "SANPIVOT" beginning
// the next are ONE directive split by the extraction — nothing may be inserted between.
#undef VANPIVOT 1066 #undef VA11 1067 #undef VA21 1068 #undef VA12 1069 #undef VA22 1070 #undef VA13 1071 #undef VA23 1072 #undef VU11 1073 #undef VU12 1074 #undef VU21 1075 #undef VU22 1076 #undef VU31 1077 #undef VU32 1078 1079 // Update quaternion representation of U 1080 1081 #ifdef COMPUTE_U_AS_QUATERNION 1082 ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Ssh.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_mul_ps(Vsh,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_mul_ps(Vsh,Vquvz);) 1083 ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqus);) 1084 ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Squvy.f-Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_sub_ps(Vquvy,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_sub_ps(Vquvy,Vsh);) 1085 ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_mul_ps(Vch,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_mul_ps(Vch,Vqus);) 1086 ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Sch.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_mul_ps(Vch,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_mul_ps(Vch,Vquvz);) 1087 #endif 1088 1089 // Third Givens rotation 1090 1091 #define SAPIVOT Sa22 1092 #define SANPIVOT Sa32 1093 #define SA11 Sa21 1094 #define SA21 Sa31 1095 #define SA12 Sa22 1096 #define SA22 Sa32 1097 #define SA13 Sa23 1098 #define SA23 Sa33 1099 #define SU11 Su12 1100 #define SU12 Su13 1101 #define SU21 Su22 1102 #define SU22 Su23 1103 #define SU31 Su32 1104 #define SU32 Su33 1105 1106 #define VAPIVOT Va22 1107 #define VANPIVOT Va32 1108 #define VA11 Va21 1109 #define VA21 Va31 1110 #define VA12 Va22 1111 #define VA22 Va32 1112 #define VA13 Va23 1113 #define VA23 Va33 1114 #define VU11 Vu12 1115 #define VU12 Vu13 1116 #define VU21 Vu22 1117 #define VU22 Vu23 1118 #define VU31 Vu32 1119 #define VU32 Vu33 1120 1121 #include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp" 1122 1123 #undef SAPIVOT 1124 #undef 
SANPIVOT 1125 #undef SA11 1126 #undef SA21 1127 #undef SA12 1128 #undef SA22 1129 #undef SA13 1130 #undef SA23 1131 #undef SU11 1132 #undef SU12 1133 #undef SU21 1134 #undef SU22 1135 #undef SU31 1136 #undef SU32 1137 1138 #undef VAPIVOT 1139 #undef VANPIVOT 1140 #undef VA11 1141 #undef VA21 1142 #undef VA12 1143 #undef VA22 1144 #undef VA13 1145 #undef VA23 1146 #undef VU11 1147 #undef VU12 1148 #undef VU21 1149 #undef VU22 1150 #undef VU31 1151 #undef VU32 1152 1153 // Update quaternion representation of U 1154 1155 #ifdef COMPUTE_U_AS_QUATERNION 1156 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Squvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vquvx);) 1157 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Squvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vquvy);) 1158 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Ssh.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vsh,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vsh,Vquvz);) 1159 ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqus);) 1160 ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f*Squs.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_mul_ps(Vch,Vqus);) ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_mul_ps(Vch,Vqus);) 1161 ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Sch.f*Squvx.f;) ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_mul_ps(Vch,Vquvx);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_mul_ps(Vch,Vquvx);) 1162 ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Sch.f*Squvy.f;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_mul_ps(Vch,Vquvy);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_mul_ps(Vch,Vquvy);) 1163 ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Sch.f*Squvz.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_mul_ps(Vch,Vquvz);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_mul_ps(Vch,Vquvz);) 1164 ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Squvx.f+Ssh.f;) 
// Final add/subtract of the quaternion-product cross terms into qU, then the
// closing debug sections (PRINT_DEBUGGING_OUTPUT): U, A with the printed label
// "after multiplying with U-transpose and V", and qU — each in scalar / SSE / AVX
// variants (vector paths print lane 0 only). The chunk ends by restoring Intel
// compiler warning 592, matching the "#pragma warning( disable : 592 )" at the
// top of the file.
// CAUTION(review): the "#pragma warning(" ending the third line below and the
// "default : 592 )" beginning the last are ONE directive split by the extraction —
// nothing may be inserted between those two lines.
ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_add_ps(Vquvx,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_add_ps(Vquvx,Vsh);) 1165 ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Squs.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_sub_ps(Vqus,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_sub_ps(Vqus,Vtmp1);) 1166 ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Squvy.f+Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_add_ps(Vquvy,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_add_ps(Vquvy,Vtmp3);) 1167 ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Squvz.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_sub_ps(Vquvz,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_sub_ps(Vquvz,Vtmp2);) 1168 #endif 1169 1170 #ifdef COMPUTE_U_AS_MATRIX 1171 #ifdef PRINT_DEBUGGING_OUTPUT 1172 #ifdef USE_SCALAR_IMPLEMENTATION 1173 std::cout<<"Scalar U ="<<std::endl; 1174 std::cout<<std::setw(12)<<Su11.f<<" "<<std::setw(12)<<Su12.f<<" "<<std::setw(12)<<Su13.f<<std::endl; 1175 std::cout<<std::setw(12)<<Su21.f<<" "<<std::setw(12)<<Su22.f<<" "<<std::setw(12)<<Su23.f<<std::endl; 1176 std::cout<<std::setw(12)<<Su31.f<<" "<<std::setw(12)<<Su32.f<<" "<<std::setw(12)<<Su33.f<<std::endl; 1177 #endif 1178 #ifdef USE_SSE_IMPLEMENTATION 1179 _mm_storeu_ps(buf,Vu11);U11=buf[0]; 1180 _mm_storeu_ps(buf,Vu21);U21=buf[0]; 1181 _mm_storeu_ps(buf,Vu31);U31=buf[0]; 1182 _mm_storeu_ps(buf,Vu12);U12=buf[0]; 1183 _mm_storeu_ps(buf,Vu22);U22=buf[0]; 1184 _mm_storeu_ps(buf,Vu32);U32=buf[0]; 1185 _mm_storeu_ps(buf,Vu13);U13=buf[0]; 1186 _mm_storeu_ps(buf,Vu23);U23=buf[0]; 1187 _mm_storeu_ps(buf,Vu33);U33=buf[0]; 1188 std::cout<<"Vector U ="<<std::endl; 1189 std::cout<<std::setw(12)<<U11<<" "<<std::setw(12)<<U12<<" "<<std::setw(12)<<U13<<std::endl; 1190 std::cout<<std::setw(12)<<U21<<" "<<std::setw(12)<<U22<<" "<<std::setw(12)<<U23<<std::endl; 1191 std::cout<<std::setw(12)<<U31<<" "<<std::setw(12)<<U32<<" "<<std::setw(12)<<U33<<std::endl; 1192 #endif 1193 #ifdef USE_AVX_IMPLEMENTATION 1194 _mm256_storeu_ps(buf,Vu11);U11=buf[0]; 1195 
_mm256_storeu_ps(buf,Vu21);U21=buf[0]; 1196 _mm256_storeu_ps(buf,Vu31);U31=buf[0]; 1197 _mm256_storeu_ps(buf,Vu12);U12=buf[0]; 1198 _mm256_storeu_ps(buf,Vu22);U22=buf[0]; 1199 _mm256_storeu_ps(buf,Vu32);U32=buf[0]; 1200 _mm256_storeu_ps(buf,Vu13);U13=buf[0]; 1201 _mm256_storeu_ps(buf,Vu23);U23=buf[0]; 1202 _mm256_storeu_ps(buf,Vu33);U33=buf[0]; 1203 std::cout<<"Vector U ="<<std::endl; 1204 std::cout<<std::setw(12)<<U11<<" "<<std::setw(12)<<U12<<" "<<std::setw(12)<<U13<<std::endl; 1205 std::cout<<std::setw(12)<<U21<<" "<<std::setw(12)<<U22<<" "<<std::setw(12)<<U23<<std::endl; 1206 std::cout<<std::setw(12)<<U31<<" "<<std::setw(12)<<U32<<" "<<std::setw(12)<<U33<<std::endl; 1207 #endif 1208 #endif 1209 #endif 1210 1211 #ifdef PRINT_DEBUGGING_OUTPUT 1212 #ifdef USE_SCALAR_IMPLEMENTATION 1213 std::cout<<"Scalar A (after multiplying with U-transpose and V) ="<<std::endl; 1214 std::cout<<std::setw(12)<<Sa11.f<<" "<<std::setw(12)<<Sa12.f<<" "<<std::setw(12)<<Sa13.f<<std::endl; 1215 std::cout<<std::setw(12)<<Sa21.f<<" "<<std::setw(12)<<Sa22.f<<" "<<std::setw(12)<<Sa23.f<<std::endl; 1216 std::cout<<std::setw(12)<<Sa31.f<<" "<<std::setw(12)<<Sa32.f<<" "<<std::setw(12)<<Sa33.f<<std::endl; 1217 #endif 1218 #ifdef USE_SSE_IMPLEMENTATION 1219 _mm_storeu_ps(buf,Va11);A11=buf[0]; 1220 _mm_storeu_ps(buf,Va21);A21=buf[0]; 1221 _mm_storeu_ps(buf,Va31);A31=buf[0]; 1222 _mm_storeu_ps(buf,Va12);A12=buf[0]; 1223 _mm_storeu_ps(buf,Va22);A22=buf[0]; 1224 _mm_storeu_ps(buf,Va32);A32=buf[0]; 1225 _mm_storeu_ps(buf,Va13);A13=buf[0]; 1226 _mm_storeu_ps(buf,Va23);A23=buf[0]; 1227 _mm_storeu_ps(buf,Va33);A33=buf[0]; 1228 std::cout<<"Vector A (after multiplying with U-transpose and V) ="<<std::endl; 1229 std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl; 1230 std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl; 1231 std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl; 1232 #endif 
1233 #ifdef USE_AVX_IMPLEMENTATION 1234 _mm256_storeu_ps(buf,Va11);A11=buf[0]; 1235 _mm256_storeu_ps(buf,Va21);A21=buf[0]; 1236 _mm256_storeu_ps(buf,Va31);A31=buf[0]; 1237 _mm256_storeu_ps(buf,Va12);A12=buf[0]; 1238 _mm256_storeu_ps(buf,Va22);A22=buf[0]; 1239 _mm256_storeu_ps(buf,Va32);A32=buf[0]; 1240 _mm256_storeu_ps(buf,Va13);A13=buf[0]; 1241 _mm256_storeu_ps(buf,Va23);A23=buf[0]; 1242 _mm256_storeu_ps(buf,Va33);A33=buf[0]; 1243 std::cout<<"Vector A (after multiplying with U-transpose and V) ="<<std::endl; 1244 std::cout<<std::setw(12)<<A11<<" "<<std::setw(12)<<A12<<" "<<std::setw(12)<<A13<<std::endl; 1245 std::cout<<std::setw(12)<<A21<<" "<<std::setw(12)<<A22<<" "<<std::setw(12)<<A23<<std::endl; 1246 std::cout<<std::setw(12)<<A31<<" "<<std::setw(12)<<A32<<" "<<std::setw(12)<<A33<<std::endl; 1247 #endif 1248 #endif 1249 1250 #ifdef COMPUTE_U_AS_QUATERNION 1251 #ifdef PRINT_DEBUGGING_OUTPUT 1252 #ifdef USE_SCALAR_IMPLEMENTATION 1253 std::cout<<"Scalar qU ="<<std::endl; 1254 std::cout<<std::setw(12)<<Squs.f<<" "<<std::setw(12)<<Squvx.f<<" "<<std::setw(12)<<Squvy.f<<" "<<std::setw(12)<<Squvz.f<<std::endl; 1255 #endif 1256 #ifdef USE_SSE_IMPLEMENTATION 1257 _mm_storeu_ps(buf,Vqus);QUS=buf[0]; 1258 _mm_storeu_ps(buf,Vquvx);QUVX=buf[0]; 1259 _mm_storeu_ps(buf,Vquvy);QUVY=buf[0]; 1260 _mm_storeu_ps(buf,Vquvz);QUVZ=buf[0]; 1261 std::cout<<"Vector qU ="<<std::endl; 1262 std::cout<<std::setw(12)<<QUS<<" "<<std::setw(12)<<QUVX<<" "<<std::setw(12)<<QUVY<<" "<<std::setw(12)<<QUVZ<<std::endl; 1263 #endif 1264 #ifdef USE_AVX_IMPLEMENTATION 1265 _mm256_storeu_ps(buf,Vqus);QUS=buf[0]; 1266 _mm256_storeu_ps(buf,Vquvx);QUVX=buf[0]; 1267 _mm256_storeu_ps(buf,Vquvy);QUVY=buf[0]; 1268 _mm256_storeu_ps(buf,Vquvz);QUVZ=buf[0]; 1269 std::cout<<"Vector qU ="<<std::endl; 1270 std::cout<<std::setw(12)<<QUS<<" "<<std::setw(12)<<QUVX<<" "<<std::setw(12)<<QUVY<<" "<<std::setw(12)<<QUVZ<<std::endl; 1271 #endif 1272 #endif 1273 #endif 1274 1275 #ifdef __INTEL_COMPILER 1276 #pragma warning( 
default : 592 ) 1277 #endif 1278