1 //#####################################################################
2 // Copyright (c) 2010-2011, Eftychios Sifakis.
3 //
4 // Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met:
5 //   * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer.
6 //   * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or
7 //     other materials provided with the distribution.
8 //
9 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING,
10 // BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT
11 // SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
12 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
13 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
14 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
15 //#####################################################################
16 
17 #ifdef __INTEL_COMPILER
18 #pragma warning( disable : 592 )
19 #endif
20 
21 // #define USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION
22 // #define PERFORM_STRICT_QUATERNION_RENORMALIZATION
23 
24 { // Begin block : Scope of qV (if not maintained)
25 
26 #ifndef COMPUTE_V_AS_QUATERNION
ENABLE_SSE_IMPLEMENTATION(__m128 Vqvs;)27     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvs;)                  ENABLE_SSE_IMPLEMENTATION(__m128 Vqvs;)                                                   ENABLE_AVX_IMPLEMENTATION(__m256 Vqvs;)
28     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvx;)                 ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvx;)                                                  ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvx;)
29     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvy;)                 ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvy;)                                                  ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvy;)
30     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sqvvz;)                 ENABLE_SSE_IMPLEMENTATION(__m128 Vqvvz;)                                                  ENABLE_AVX_IMPLEMENTATION(__m256 Vqvvz;)
31 #endif
32 
33 { // Begin block : Symmetric eigenanalysis
34 
35     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss11;)                  ENABLE_SSE_IMPLEMENTATION(__m128 Vs11;)                                                   ENABLE_AVX_IMPLEMENTATION(__m256 Vs11;)
36     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss21;)                  ENABLE_SSE_IMPLEMENTATION(__m128 Vs21;)                                                   ENABLE_AVX_IMPLEMENTATION(__m256 Vs21;)
37     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss31;)                  ENABLE_SSE_IMPLEMENTATION(__m128 Vs31;)                                                   ENABLE_AVX_IMPLEMENTATION(__m256 Vs31;)
38     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss22;)                  ENABLE_SSE_IMPLEMENTATION(__m128 Vs22;)                                                   ENABLE_AVX_IMPLEMENTATION(__m256 Vs22;)
39     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss32;)                  ENABLE_SSE_IMPLEMENTATION(__m128 Vs32;)                                                   ENABLE_AVX_IMPLEMENTATION(__m256 Vs32;)
40     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Ss33;)                  ENABLE_SSE_IMPLEMENTATION(__m128 Vs33;)                                                   ENABLE_AVX_IMPLEMENTATION(__m256 Vs33;)
41 
42     ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=1.;)                                              ENABLE_SSE_IMPLEMENTATION(Vqvs=Vone;)                                                     ENABLE_AVX_IMPLEMENTATION(Vqvs=Vone;)
43     ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=0.;)                                             ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_xor_ps(Vqvvx,Vqvvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_xor_ps(Vqvvx,Vqvvx);)
44     ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=0.;)                                             ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_xor_ps(Vqvvy,Vqvvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_xor_ps(Vqvvy,Vqvvy);)
45     ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=0.;)                                             ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_xor_ps(Vqvvz,Vqvvz);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_xor_ps(Vqvvz,Vqvvz);)
46 
47     //###########################################################
48     // Compute normal equations matrix
49     //###########################################################
50 
51     ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Sa11.f*Sa11.f;)                                   ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_mul_ps(Va11,Va11);)                                    ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_mul_ps(Va11,Va11);)
52     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa21.f*Sa21.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va21,Va21);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va21,Va21);)
53     ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Stmp1.f+Ss11.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_add_ps(Vtmp1,Vs11);)                                   ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_add_ps(Vtmp1,Vs11);)
54     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa31.f*Sa31.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va31,Va31);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va31,Va31);)
55     ENABLE_SCALAR_IMPLEMENTATION(Ss11.f=Stmp1.f+Ss11.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs11=_mm_add_ps(Vtmp1,Vs11);)                                   ENABLE_AVX_IMPLEMENTATION(Vs11=_mm256_add_ps(Vtmp1,Vs11);)
56 
57     ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Sa12.f*Sa11.f;)                                   ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_mul_ps(Va12,Va11);)                                    ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_mul_ps(Va12,Va11);)
58     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa22.f*Sa21.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va22,Va21);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va22,Va21);)
59     ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Stmp1.f+Ss21.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_add_ps(Vtmp1,Vs21);)                                   ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_add_ps(Vtmp1,Vs21);)
60     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa32.f*Sa31.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va32,Va31);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va32,Va31);)
61     ENABLE_SCALAR_IMPLEMENTATION(Ss21.f=Stmp1.f+Ss21.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs21=_mm_add_ps(Vtmp1,Vs21);)                                   ENABLE_AVX_IMPLEMENTATION(Vs21=_mm256_add_ps(Vtmp1,Vs21);)
62 
63     ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Sa13.f*Sa11.f;)                                   ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_mul_ps(Va13,Va11);)                                    ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_mul_ps(Va13,Va11);)
64     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa21.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va21);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va21);)
65     ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Stmp1.f+Ss31.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_add_ps(Vtmp1,Vs31);)                                   ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_add_ps(Vtmp1,Vs31);)
66     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa31.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va31);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va31);)
67     ENABLE_SCALAR_IMPLEMENTATION(Ss31.f=Stmp1.f+Ss31.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs31=_mm_add_ps(Vtmp1,Vs31);)                                   ENABLE_AVX_IMPLEMENTATION(Vs31=_mm256_add_ps(Vtmp1,Vs31);)
68 
69     ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Sa12.f*Sa12.f;)                                   ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_mul_ps(Va12,Va12);)                                    ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_mul_ps(Va12,Va12);)
70     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa22.f*Sa22.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va22,Va22);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va22,Va22);)
71     ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Stmp1.f+Ss22.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_add_ps(Vtmp1,Vs22);)                                   ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_add_ps(Vtmp1,Vs22);)
72     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa32.f*Sa32.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va32,Va32);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va32,Va32);)
73     ENABLE_SCALAR_IMPLEMENTATION(Ss22.f=Stmp1.f+Ss22.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs22=_mm_add_ps(Vtmp1,Vs22);)                                   ENABLE_AVX_IMPLEMENTATION(Vs22=_mm256_add_ps(Vtmp1,Vs22);)
74 
75     ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Sa13.f*Sa12.f;)                                   ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_mul_ps(Va13,Va12);)                                    ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_mul_ps(Va13,Va12);)
76     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa22.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va22);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va22);)
77     ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Stmp1.f+Ss32.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_add_ps(Vtmp1,Vs32);)                                   ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_add_ps(Vtmp1,Vs32);)
78     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa32.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va32);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va32);)
79     ENABLE_SCALAR_IMPLEMENTATION(Ss32.f=Stmp1.f+Ss32.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs32=_mm_add_ps(Vtmp1,Vs32);)                                   ENABLE_AVX_IMPLEMENTATION(Vs32=_mm256_add_ps(Vtmp1,Vs32);)
80 
81     ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Sa13.f*Sa13.f;)                                   ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_mul_ps(Va13,Va13);)                                    ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_mul_ps(Va13,Va13);)
82     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa23.f*Sa23.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va23,Va23);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va23,Va23);)
83     ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Stmp1.f+Ss33.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_add_ps(Vtmp1,Vs33);)                                   ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_add_ps(Vtmp1,Vs33);)
84     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa33.f*Sa33.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va33,Va33);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va33,Va33);)
85     ENABLE_SCALAR_IMPLEMENTATION(Ss33.f=Stmp1.f+Ss33.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vs33=_mm_add_ps(Vtmp1,Vs33);)                                   ENABLE_AVX_IMPLEMENTATION(Vs33=_mm256_add_ps(Vtmp1,Vs33);)
86 
87     //###########################################################
88     // Solve symmetric eigenproblem using Jacobi iteration
89     //###########################################################
90 
91     for(int sweep=1;sweep<=4;sweep++){
92 
93         // First Jacobi conjugation
94 
95 #define SS11 Ss11
96 #define SS21 Ss21
97 #define SS31 Ss31
98 #define SS22 Ss22
99 #define SS32 Ss32
100 #define SS33 Ss33
101 #define SQVVX Sqvvx
102 #define SQVVY Sqvvy
103 #define SQVVZ Sqvvz
104 #define STMP1 Stmp1
105 #define STMP2 Stmp2
106 #define STMP3 Stmp3
107 
108 #define VS11 Vs11
109 #define VS21 Vs21
110 #define VS31 Vs31
111 #define VS22 Vs22
112 #define VS32 Vs32
113 #define VS33 Vs33
114 #define VQVVX Vqvvx
115 #define VQVVY Vqvvy
116 #define VQVVZ Vqvvz
117 #define VTMP1 Vtmp1
118 #define VTMP2 Vtmp2
119 #define VTMP3 Vtmp3
120 
121 #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp"
122 
123 #undef SS11
124 #undef SS21
125 #undef SS31
126 #undef SS22
127 #undef SS32
128 #undef SS33
129 #undef SQVVX
130 #undef SQVVY
131 #undef SQVVZ
132 #undef STMP1
133 #undef STMP2
134 #undef STMP3
135 
136 #undef VS11
137 #undef VS21
138 #undef VS31
139 #undef VS22
140 #undef VS32
141 #undef VS33
142 #undef VQVVX
143 #undef VQVVY
144 #undef VQVVZ
145 #undef VTMP1
146 #undef VTMP2
147 #undef VTMP3
148 
149         // Second Jacobi conjugation
150 
151 #define SS11 Ss22
152 #define SS21 Ss32
153 #define SS31 Ss21
154 #define SS22 Ss33
155 #define SS32 Ss31
156 #define SS33 Ss11
157 #define SQVVX Sqvvy
158 #define SQVVY Sqvvz
159 #define SQVVZ Sqvvx
160 #define STMP1 Stmp2
161 #define STMP2 Stmp3
162 #define STMP3 Stmp1
163 
164 #define VS11 Vs22
165 #define VS21 Vs32
166 #define VS31 Vs21
167 #define VS22 Vs33
168 #define VS32 Vs31
169 #define VS33 Vs11
170 #define VQVVX Vqvvy
171 #define VQVVY Vqvvz
172 #define VQVVZ Vqvvx
173 #define VTMP1 Vtmp2
174 #define VTMP2 Vtmp3
175 #define VTMP3 Vtmp1
176 
177 #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp"
178 
179 #undef SS11
180 #undef SS21
181 #undef SS31
182 #undef SS22
183 #undef SS32
184 #undef SS33
185 #undef SQVVX
186 #undef SQVVY
187 #undef SQVVZ
188 #undef STMP1
189 #undef STMP2
190 #undef STMP3
191 
192 #undef VS11
193 #undef VS21
194 #undef VS31
195 #undef VS22
196 #undef VS32
197 #undef VS33
198 #undef VQVVX
199 #undef VQVVY
200 #undef VQVVZ
201 #undef VTMP1
202 #undef VTMP2
203 #undef VTMP3
204 
205         // Third Jacobi conjugation
206 
207 #define SS11 Ss33
208 #define SS21 Ss31
209 #define SS31 Ss32
210 #define SS22 Ss11
211 #define SS32 Ss21
212 #define SS33 Ss22
213 #define SQVVX Sqvvz
214 #define SQVVY Sqvvx
215 #define SQVVZ Sqvvy
216 #define STMP1 Stmp3
217 #define STMP2 Stmp1
218 #define STMP3 Stmp2
219 
220 #define VS11 Vs33
221 #define VS21 Vs31
222 #define VS31 Vs32
223 #define VS22 Vs11
224 #define VS32 Vs21
225 #define VS33 Vs22
226 #define VQVVX Vqvvz
227 #define VQVVY Vqvvx
228 #define VQVVZ Vqvvy
229 #define VTMP1 Vtmp3
230 #define VTMP2 Vtmp1
231 #define VTMP3 Vtmp2
232 
233 #include "Singular_Value_Decomposition_Jacobi_Conjugation_Kernel.hpp"
234 
235 #undef SS11
236 #undef SS21
237 #undef SS31
238 #undef SS22
239 #undef SS32
240 #undef SS33
241 #undef SQVVX
242 #undef SQVVY
243 #undef SQVVZ
244 #undef STMP1
245 #undef STMP2
246 #undef STMP3
247 
248 #undef VS11
249 #undef VS21
250 #undef VS31
251 #undef VS22
252 #undef VS32
253 #undef VS33
254 #undef VQVVX
255 #undef VQVVY
256 #undef VQVVZ
257 #undef VTMP1
258 #undef VTMP2
259 #undef VTMP3
260     }
261 
262 #ifdef PRINT_DEBUGGING_OUTPUT
263 #ifdef USE_SCALAR_IMPLEMENTATION
264     std::cout<<"Scalar S ="<<std::endl;
265     std::cout<<std::setw(12)<<Ss11.f<<std::endl;
266     std::cout<<std::setw(12)<<Ss21.f<<"  "<<std::setw(12)<<Ss22.f<<std::endl;
267     std::cout<<std::setw(12)<<Ss31.f<<"  "<<std::setw(12)<<Ss32.f<<"  "<<std::setw(12)<<Ss33.f<<std::endl;
268 #endif
269 #ifdef USE_SSE_IMPLEMENTATION
270     _mm_storeu_ps(buf,Vs11);S11=buf[0];
271     _mm_storeu_ps(buf,Vs21);S21=buf[0];
272     _mm_storeu_ps(buf,Vs31);S31=buf[0];
273     _mm_storeu_ps(buf,Vs22);S22=buf[0];
274     _mm_storeu_ps(buf,Vs32);S32=buf[0];
275     _mm_storeu_ps(buf,Vs33);S33=buf[0];
276     std::cout<<"Vector S ="<<std::endl;
277     std::cout<<std::setw(12)<<S11<<std::endl;
278     std::cout<<std::setw(12)<<S21<<"  "<<std::setw(12)<<S22<<std::endl;
279     std::cout<<std::setw(12)<<S31<<"  "<<std::setw(12)<<S32<<"  "<<std::setw(12)<<S33<<std::endl;
280 #endif
281 #ifdef USE_AVX_IMPLEMENTATION
282     _mm256_storeu_ps(buf,Vs11);S11=buf[0];
283     _mm256_storeu_ps(buf,Vs21);S21=buf[0];
284     _mm256_storeu_ps(buf,Vs31);S31=buf[0];
285     _mm256_storeu_ps(buf,Vs22);S22=buf[0];
286     _mm256_storeu_ps(buf,Vs32);S32=buf[0];
287     _mm256_storeu_ps(buf,Vs33);S33=buf[0];
288     std::cout<<"Vector S ="<<std::endl;
289     std::cout<<std::setw(12)<<S11<<std::endl;
290     std::cout<<std::setw(12)<<S21<<"  "<<std::setw(12)<<S22<<std::endl;
291     std::cout<<std::setw(12)<<S31<<"  "<<std::setw(12)<<S32<<"  "<<std::setw(12)<<S33<<std::endl;
292 #endif
293 #endif
294 
295 } // End block : Symmetric eigenanalysis
296 
297     //###########################################################
298     // Normalize quaternion for matrix V
299     //###########################################################
300 
301 #if !defined(USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION) || defined(PERFORM_STRICT_QUATERNION_RENORMALIZATION)
302 
303     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvs.f*Sqvs.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvs,Vqvs);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvs,Vqvs);)
304     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);)
305     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
306     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Sqvvy.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vqvvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vqvvy);)
307     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
308     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvz.f*Sqvvz.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvz,Vqvvz);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvz,Vqvvz);)
309     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
310 
311     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=rsqrt(Stmp2.f);)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_rsqrt_ps(Vtmp2);)                                     ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_rsqrt_ps(Vtmp2);)
312     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp1.f*Sone_half.f;)                            ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp1,Vone_half);)                             ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp1,Vone_half);)
313     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp4);)
314     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp3.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp3);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp3);)
315     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp2.f*Stmp3.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp2,Vtmp3);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp2,Vtmp3);)
316     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
317     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f-Stmp3.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_sub_ps(Vtmp1,Vtmp3);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_sub_ps(Vtmp1,Vtmp3);)
318 
319     ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp1);)
320     ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp1.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp1);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp1);)
321     ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp1.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp1);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp1);)
322     ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp1.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp1);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp1);)
323 
324 #ifdef PRINT_DEBUGGING_OUTPUT
325 #ifdef USE_SCALAR_IMPLEMENTATION
326     std::cout<<"Scalar qV ="<<std::endl;
327     std::cout<<std::setw(12)<<Sqvs.f<<"  "<<std::setw(12)<<Sqvvx.f<<"  "<<std::setw(12)<<Sqvvy.f<<"  "<<std::setw(12)<<Sqvvz.f<<std::endl;
328 #endif
329 #ifdef USE_SSE_IMPLEMENTATION
330     _mm_storeu_ps(buf,Vqvs);QVS=buf[0];
331     _mm_storeu_ps(buf,Vqvvx);QVVX=buf[0];
332     _mm_storeu_ps(buf,Vqvvy);QVVY=buf[0];
333     _mm_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
334     std::cout<<"Vector qV ="<<std::endl;
335     std::cout<<std::setw(12)<<QVS<<"  "<<std::setw(12)<<QVVX<<"  "<<std::setw(12)<<QVVY<<"  "<<std::setw(12)<<QVVZ<<std::endl;
336 #endif
337 #ifdef USE_AVX_IMPLEMENTATION
338     _mm256_storeu_ps(buf,Vqvs);QVS=buf[0];
339     _mm256_storeu_ps(buf,Vqvvx);QVVX=buf[0];
340     _mm256_storeu_ps(buf,Vqvvy);QVVY=buf[0];
341     _mm256_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
342     std::cout<<"Vector qV ="<<std::endl;
343     std::cout<<std::setw(12)<<QVS<<"  "<<std::setw(12)<<QVVX<<"  "<<std::setw(12)<<QVVY<<"  "<<std::setw(12)<<QVVZ<<std::endl;
344 #endif
345 #endif
346 
347 #endif
348 
349 { // Begin block : Conjugation with V
350 
351     //###########################################################
352     // Transform quaternion to matrix V
353     //###########################################################
354 
355 #ifndef COMPUTE_V_AS_MATRIX
356     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv11;)                  ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv11;)
357     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv21;)                  ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv21;)
358     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv31;)                  ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv31;)
359     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv12;)                  ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv12;)
360     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv22;)                  ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv22;)
361     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv32;)                  ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv32;)
362     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv13;)                  ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv13;)
363     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv23;)                  ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv23;)
364     ENABLE_SCALAR_IMPLEMENTATION(union {float f;unsigned int ui;} Sv33;)                  ENABLE_VECTOR_IMPLEMENTATION(__m128 Vv33;)
365 #endif
366 
367     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);)
368     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvy.f*Sqvvy.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvvy,Vqvvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvvy,Vqvvy);)
369     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvz.f*Sqvvz.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vqvvz,Vqvvz);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vqvvz,Vqvvz);)
370     ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sqvs.f*Sqvs.f;)                                   ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_mul_ps(Vqvs,Vqvs);)                                    ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_mul_ps(Vqvs,Vqvs);)
371     ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv11.f-Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_sub_ps(Vv11,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_sub_ps(Vv11,Vtmp1);)
372     ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv22.f-Stmp2.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_sub_ps(Vv22,Vtmp2);)                                   ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_sub_ps(Vv22,Vtmp2);)
373     ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv33.f+Stmp3.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_add_ps(Vv33,Vtmp3);)                                   ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_add_ps(Vv33,Vtmp3);)
374     ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f+Stmp2.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_add_ps(Vv22,Vtmp2);)                                   ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_add_ps(Vv22,Vtmp2);)
375     ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f-Stmp3.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_sub_ps(Vv22,Vtmp3);)                                   ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_sub_ps(Vv22,Vtmp3);)
376     ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_add_ps(Vv11,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_add_ps(Vv11,Vtmp1);)
377     ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f-Stmp2.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_sub_ps(Vv11,Vtmp2);)                                   ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_sub_ps(Vv11,Vtmp2);)
378     ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f-Stmp3.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_sub_ps(Vv11,Vtmp3);)                                   ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_sub_ps(Vv11,Vtmp3);)
379     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f+Sqvvx.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vqvvx,Vqvvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vqvvx,Vqvvx);)
380     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvy.f+Sqvvy.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vqvvy,Vqvvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vqvvy,Vqvvy);)
381     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvz.f+Sqvvz.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vqvvz,Vqvvz);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vqvvz,Vqvvz);)
382     ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Sqvs.f*Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_mul_ps(Vqvs,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_mul_ps(Vqvs,Vtmp1);)
383     ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Sqvs.f*Stmp2.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_mul_ps(Vqvs,Vtmp2);)                                   ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_mul_ps(Vqvs,Vtmp2);)
384     ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Sqvs.f*Stmp3.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_mul_ps(Vqvs,Vtmp3);)                                   ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_mul_ps(Vqvs,Vtmp3);)
385     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Stmp1.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vtmp1);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vtmp1);)
386     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvvz.f*Stmp2.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvvz,Vtmp2);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvvz,Vtmp2);)
387     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sqvvx.f*Stmp3.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vqvvx,Vtmp3);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vqvvx,Vtmp3);)
388     ENABLE_SCALAR_IMPLEMENTATION(Sv12.f=Stmp1.f-Sv21.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_sub_ps(Vtmp1,Vv21);)                                   ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_sub_ps(Vtmp1,Vv21);)
389     ENABLE_SCALAR_IMPLEMENTATION(Sv23.f=Stmp2.f-Sv32.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_sub_ps(Vtmp2,Vv32);)                                   ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_sub_ps(Vtmp2,Vv32);)
390     ENABLE_SCALAR_IMPLEMENTATION(Sv31.f=Stmp3.f-Sv13.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_sub_ps(Vtmp3,Vv13);)                                   ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_sub_ps(Vtmp3,Vv13);)
391     ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Stmp1.f+Sv21.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_add_ps(Vtmp1,Vv21);)                                   ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_add_ps(Vtmp1,Vv21);)
392     ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Stmp2.f+Sv32.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_add_ps(Vtmp2,Vv32);)                                   ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_add_ps(Vtmp2,Vv32);)
393     ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Stmp3.f+Sv13.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_add_ps(Vtmp3,Vv13);)                                   ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_add_ps(Vtmp3,Vv13);)
394 
395 #ifdef COMPUTE_V_AS_MATRIX
396 #ifdef PRINT_DEBUGGING_OUTPUT
397 #ifdef USE_SCALAR_IMPLEMENTATION
398     std::cout<<"Scalar V ="<<std::endl;
399     std::cout<<std::setw(12)<<Sv11.f<<"  "<<std::setw(12)<<Sv12.f<<"  "<<std::setw(12)<<Sv13.f<<std::endl;
400     std::cout<<std::setw(12)<<Sv21.f<<"  "<<std::setw(12)<<Sv22.f<<"  "<<std::setw(12)<<Sv23.f<<std::endl;
401     std::cout<<std::setw(12)<<Sv31.f<<"  "<<std::setw(12)<<Sv32.f<<"  "<<std::setw(12)<<Sv33.f<<std::endl;
402 #endif
403 #ifdef USE_SSE_IMPLEMENTATION
404     _mm_storeu_ps(buf,Vv11);V11=buf[0];
405     _mm_storeu_ps(buf,Vv21);V21=buf[0];
406     _mm_storeu_ps(buf,Vv31);V31=buf[0];
407     _mm_storeu_ps(buf,Vv12);V12=buf[0];
408     _mm_storeu_ps(buf,Vv22);V22=buf[0];
409     _mm_storeu_ps(buf,Vv32);V32=buf[0];
410     _mm_storeu_ps(buf,Vv13);V13=buf[0];
411     _mm_storeu_ps(buf,Vv23);V23=buf[0];
412     _mm_storeu_ps(buf,Vv33);V33=buf[0];
413     std::cout<<"Vector V ="<<std::endl;
414     std::cout<<std::setw(12)<<V11<<"  "<<std::setw(12)<<V12<<"  "<<std::setw(12)<<V13<<std::endl;
415     std::cout<<std::setw(12)<<V21<<"  "<<std::setw(12)<<V22<<"  "<<std::setw(12)<<V23<<std::endl;
416     std::cout<<std::setw(12)<<V31<<"  "<<std::setw(12)<<V32<<"  "<<std::setw(12)<<V33<<std::endl;
417 #endif
418 #ifdef USE_AVX_IMPLEMENTATION
419     _mm256_storeu_ps(buf,Vv11);V11=buf[0];
420     _mm256_storeu_ps(buf,Vv21);V21=buf[0];
421     _mm256_storeu_ps(buf,Vv31);V31=buf[0];
422     _mm256_storeu_ps(buf,Vv12);V12=buf[0];
423     _mm256_storeu_ps(buf,Vv22);V22=buf[0];
424     _mm256_storeu_ps(buf,Vv32);V32=buf[0];
425     _mm256_storeu_ps(buf,Vv13);V13=buf[0];
426     _mm256_storeu_ps(buf,Vv23);V23=buf[0];
427     _mm256_storeu_ps(buf,Vv33);V33=buf[0];
428     std::cout<<"Vector V ="<<std::endl;
429     std::cout<<std::setw(12)<<V11<<"  "<<std::setw(12)<<V12<<"  "<<std::setw(12)<<V13<<std::endl;
430     std::cout<<std::setw(12)<<V21<<"  "<<std::setw(12)<<V22<<"  "<<std::setw(12)<<V23<<std::endl;
431     std::cout<<std::setw(12)<<V31<<"  "<<std::setw(12)<<V32<<"  "<<std::setw(12)<<V33<<std::endl;
432 #endif
433 #endif
434 #endif
435 
436     //###########################################################
437     // Multiply (from the right) with V
438     //###########################################################
439 
440     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa12.f;)                                         ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va12;)                                                    ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va12;)
441     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa13.f;)                                         ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va13;)                                                    ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va13;)
442     ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sv12.f*Sa11.f;)                                   ENABLE_SSE_IMPLEMENTATION(Va12=_mm_mul_ps(Vv12,Va11);)                                    ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_mul_ps(Vv12,Va11);)
443     ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sv13.f*Sa11.f;)                                   ENABLE_SSE_IMPLEMENTATION(Va13=_mm_mul_ps(Vv13,Va11);)                                    ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_mul_ps(Vv13,Va11);)
444     ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sv11.f*Sa11.f;)                                   ENABLE_SSE_IMPLEMENTATION(Va11=_mm_mul_ps(Vv11,Va11);)                                    ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_mul_ps(Vv11,Va11);)
445     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);)
446     ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va11=_mm_add_ps(Va11,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_add_ps(Va11,Vtmp1);)
447     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);)
448     ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va11=_mm_add_ps(Va11,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_add_ps(Va11,Vtmp1);)
449     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);)
450     ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va12=_mm_add_ps(Va12,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_add_ps(Va12,Vtmp1);)
451     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);)
452     ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va12=_mm_add_ps(Va12,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_add_ps(Va12,Vtmp1);)
453     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);)
454     ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va13=_mm_add_ps(Va13,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_add_ps(Va13,Vtmp1);)
455     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);)
456     ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va13=_mm_add_ps(Va13,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_add_ps(Va13,Vtmp1);)
457 
458     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa22.f;)                                         ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va22;)                                                    ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va22;)
459     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa23.f;)                                         ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va23;)                                                    ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va23;)
460     ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sv12.f*Sa21.f;)                                   ENABLE_SSE_IMPLEMENTATION(Va22=_mm_mul_ps(Vv12,Va21);)                                    ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_mul_ps(Vv12,Va21);)
461     ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sv13.f*Sa21.f;)                                   ENABLE_SSE_IMPLEMENTATION(Va23=_mm_mul_ps(Vv13,Va21);)                                    ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_mul_ps(Vv13,Va21);)
462     ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sv11.f*Sa21.f;)                                   ENABLE_SSE_IMPLEMENTATION(Va21=_mm_mul_ps(Vv11,Va21);)                                    ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_mul_ps(Vv11,Va21);)
463     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);)
464     ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va21=_mm_add_ps(Va21,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_add_ps(Va21,Vtmp1);)
465     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);)
466     ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va21=_mm_add_ps(Va21,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_add_ps(Va21,Vtmp1);)
467     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);)
468     ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va22=_mm_add_ps(Va22,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_add_ps(Va22,Vtmp1);)
469     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);)
470     ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va22=_mm_add_ps(Va22,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_add_ps(Va22,Vtmp1);)
471     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);)
472     ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va23=_mm_add_ps(Va23,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_add_ps(Va23,Vtmp1);)
473     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);)
474     ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va23=_mm_add_ps(Va23,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_add_ps(Va23,Vtmp1);)
475 
476     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa32.f;)                                         ENABLE_SSE_IMPLEMENTATION(Vtmp2=Va32;)                                                    ENABLE_AVX_IMPLEMENTATION(Vtmp2=Va32;)
477     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa33.f;)                                         ENABLE_SSE_IMPLEMENTATION(Vtmp3=Va33;)                                                    ENABLE_AVX_IMPLEMENTATION(Vtmp3=Va33;)
478     ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sv12.f*Sa31.f;)                                   ENABLE_SSE_IMPLEMENTATION(Va32=_mm_mul_ps(Vv12,Va31);)                                    ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_mul_ps(Vv12,Va31);)
479     ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sv13.f*Sa31.f;)                                   ENABLE_SSE_IMPLEMENTATION(Va33=_mm_mul_ps(Vv13,Va31);)                                    ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_mul_ps(Vv13,Va31);)
480     ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sv11.f*Sa31.f;)                                   ENABLE_SSE_IMPLEMENTATION(Va31=_mm_mul_ps(Vv11,Va31);)                                    ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_mul_ps(Vv11,Va31);)
481     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv21.f*Stmp2.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv21,Vtmp2);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv21,Vtmp2);)
482     ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va31=_mm_add_ps(Va31,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_add_ps(Va31,Vtmp1);)
483     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv31.f*Stmp3.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv31,Vtmp3);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv31,Vtmp3);)
484     ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va31=_mm_add_ps(Va31,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_add_ps(Va31,Vtmp1);)
485     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv22.f*Stmp2.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv22,Vtmp2);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv22,Vtmp2);)
486     ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va32=_mm_add_ps(Va32,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_add_ps(Va32,Vtmp1);)
487     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv32.f*Stmp3.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv32,Vtmp3);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv32,Vtmp3);)
488     ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va32=_mm_add_ps(Va32,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_add_ps(Va32,Vtmp1);)
489     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv23.f*Stmp2.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv23,Vtmp2);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv23,Vtmp2);)
490     ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va33=_mm_add_ps(Va33,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_add_ps(Va33,Vtmp1);)
491     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sv33.f*Stmp3.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vv33,Vtmp3);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vv33,Vtmp3);)
492     ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f+Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va33=_mm_add_ps(Va33,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_add_ps(Va33,Vtmp1);)
493 
494 #ifdef PRINT_DEBUGGING_OUTPUT
495 #ifdef USE_SCALAR_IMPLEMENTATION
496     std::cout<<"Scalar A (after multiplying with V) ="<<std::endl;
497     std::cout<<std::setw(12)<<Sa11.f<<"  "<<std::setw(12)<<Sa12.f<<"  "<<std::setw(12)<<Sa13.f<<std::endl;
498     std::cout<<std::setw(12)<<Sa21.f<<"  "<<std::setw(12)<<Sa22.f<<"  "<<std::setw(12)<<Sa23.f<<std::endl;
499     std::cout<<std::setw(12)<<Sa31.f<<"  "<<std::setw(12)<<Sa32.f<<"  "<<std::setw(12)<<Sa33.f<<std::endl;
500 #endif
501 #ifdef USE_SSE_IMPLEMENTATION
502     _mm_storeu_ps(buf,Va11);A11=buf[0];
503     _mm_storeu_ps(buf,Va21);A21=buf[0];
504     _mm_storeu_ps(buf,Va31);A31=buf[0];
505     _mm_storeu_ps(buf,Va12);A12=buf[0];
506     _mm_storeu_ps(buf,Va22);A22=buf[0];
507     _mm_storeu_ps(buf,Va32);A32=buf[0];
508     _mm_storeu_ps(buf,Va13);A13=buf[0];
509     _mm_storeu_ps(buf,Va23);A23=buf[0];
510     _mm_storeu_ps(buf,Va33);A33=buf[0];
511     std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
512     std::cout<<std::setw(12)<<A11<<"  "<<std::setw(12)<<A12<<"  "<<std::setw(12)<<A13<<std::endl;
513     std::cout<<std::setw(12)<<A21<<"  "<<std::setw(12)<<A22<<"  "<<std::setw(12)<<A23<<std::endl;
514     std::cout<<std::setw(12)<<A31<<"  "<<std::setw(12)<<A32<<"  "<<std::setw(12)<<A33<<std::endl;
515 #endif
516 #ifdef USE_AVX_IMPLEMENTATION
517     _mm256_storeu_ps(buf,Va11);A11=buf[0];
518     _mm256_storeu_ps(buf,Va21);A21=buf[0];
519     _mm256_storeu_ps(buf,Va31);A31=buf[0];
520     _mm256_storeu_ps(buf,Va12);A12=buf[0];
521     _mm256_storeu_ps(buf,Va22);A22=buf[0];
522     _mm256_storeu_ps(buf,Va32);A32=buf[0];
523     _mm256_storeu_ps(buf,Va13);A13=buf[0];
524     _mm256_storeu_ps(buf,Va23);A23=buf[0];
525     _mm256_storeu_ps(buf,Va33);A33=buf[0];
526     std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
527     std::cout<<std::setw(12)<<A11<<"  "<<std::setw(12)<<A12<<"  "<<std::setw(12)<<A13<<std::endl;
528     std::cout<<std::setw(12)<<A21<<"  "<<std::setw(12)<<A22<<"  "<<std::setw(12)<<A23<<std::endl;
529     std::cout<<std::setw(12)<<A31<<"  "<<std::setw(12)<<A32<<"  "<<std::setw(12)<<A33<<std::endl;
530 #endif
531 #endif
532 
533 } // End block : Conjugation with V
534 
535 } // End block : Scope of qV (if not maintained)
536 
537     //###########################################################
538     // Permute columns such that the singular values are sorted
539     //###########################################################
540 
541     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sa11.f*Sa11.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Va11,Va11);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Va11,Va11);)
542     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa21.f*Sa21.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va21,Va21);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va21,Va21);)
543     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
544     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa31.f*Sa31.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va31,Va31);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va31,Va31);)
545     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
546 
547     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sa12.f*Sa12.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Va12,Va12);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Va12,Va12);)
548     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa22.f*Sa22.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va22,Va22);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va22,Va22);)
549     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f+Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp2,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp2,Vtmp4);)
550     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa32.f*Sa32.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va32,Va32);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va32,Va32);)
551     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f+Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp2,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp2,Vtmp4);)
552 
553     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Sa13.f*Sa13.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Va13,Va13);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Va13,Va13);)
554     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa23.f*Sa23.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va23,Va23);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va23,Va23);)
555     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp3.f+Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp3,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp3,Vtmp4);)
556     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sa33.f*Sa33.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Va33,Va33);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Va33,Va33);)
557     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp3.f+Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp3,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp3,Vtmp4);)
558 
559     // Swap columns 1-2 if necessary
560 
561     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp1.f<Stmp2.f)?0xffffffff:0;)                ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp1,Vtmp2);)                               ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp1,Vtmp2, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp1,Vtmp2);)
562     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa11.ui^Sa12.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va11,Va12);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va11,Va12);)
563     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
564     ENABLE_SCALAR_IMPLEMENTATION(Sa11.ui=Sa11.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va11=_mm_xor_ps(Va11,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_xor_ps(Va11,Vtmp5);)
565     ENABLE_SCALAR_IMPLEMENTATION(Sa12.ui=Sa12.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va12=_mm_xor_ps(Va12,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_xor_ps(Va12,Vtmp5);)
566 
567     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa21.ui^Sa22.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va21,Va22);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va21,Va22);)
568     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
569     ENABLE_SCALAR_IMPLEMENTATION(Sa21.ui=Sa21.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va21=_mm_xor_ps(Va21,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_xor_ps(Va21,Vtmp5);)
570     ENABLE_SCALAR_IMPLEMENTATION(Sa22.ui=Sa22.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va22=_mm_xor_ps(Va22,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_xor_ps(Va22,Vtmp5);)
571 
572     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa31.ui^Sa32.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va31,Va32);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va31,Va32);)
573     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
574     ENABLE_SCALAR_IMPLEMENTATION(Sa31.ui=Sa31.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va31=_mm_xor_ps(Va31,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_xor_ps(Va31,Vtmp5);)
575     ENABLE_SCALAR_IMPLEMENTATION(Sa32.ui=Sa32.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va32=_mm_xor_ps(Va32,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_xor_ps(Va32,Vtmp5);)
576 
577 #ifdef COMPUTE_V_AS_MATRIX
578     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv11.ui^Sv12.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv11,Vv12);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv11,Vv12);)
579     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
580     ENABLE_SCALAR_IMPLEMENTATION(Sv11.ui=Sv11.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_xor_ps(Vv11,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_xor_ps(Vv11,Vtmp5);)
581     ENABLE_SCALAR_IMPLEMENTATION(Sv12.ui=Sv12.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_xor_ps(Vv12,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_xor_ps(Vv12,Vtmp5);)
582 
583     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv21.ui^Sv22.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv21,Vv22);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv21,Vv22);)
584     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
585     ENABLE_SCALAR_IMPLEMENTATION(Sv21.ui=Sv21.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_xor_ps(Vv21,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_xor_ps(Vv21,Vtmp5);)
586     ENABLE_SCALAR_IMPLEMENTATION(Sv22.ui=Sv22.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_xor_ps(Vv22,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_xor_ps(Vv22,Vtmp5);)
587 
588     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv31.ui^Sv32.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv31,Vv32);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv31,Vv32);)
589     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
590     ENABLE_SCALAR_IMPLEMENTATION(Sv31.ui=Sv31.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_xor_ps(Vv31,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_xor_ps(Vv31,Vtmp5);)
591     ENABLE_SCALAR_IMPLEMENTATION(Sv32.ui=Sv32.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_xor_ps(Vv32,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_xor_ps(Vv32,Vtmp5);)
592 #endif
593 
594     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp1.ui^Stmp2.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp1,Vtmp2);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp1,Vtmp2);)
595     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
596     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=Stmp1.ui^Stmp5.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_xor_ps(Vtmp1,Vtmp5);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_xor_ps(Vtmp1,Vtmp5);)
597     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Stmp2.ui^Stmp5.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_xor_ps(Vtmp2,Vtmp5);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_xor_ps(Vtmp2,Vtmp5);)
598 
599     // If columns 1-2 have been swapped, negate 2nd column of A and V so that V is still a rotation
600 
601     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;)                                            ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);)                                        ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);)
602     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
603     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;)                                             ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;)                                                    ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;)
604     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);)
605 
606     ENABLE_SCALAR_IMPLEMENTATION(Sa12.f=Sa12.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va12=_mm_mul_ps(Va12,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_mul_ps(Va12,Vtmp4);)
607     ENABLE_SCALAR_IMPLEMENTATION(Sa22.f=Sa22.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va22=_mm_mul_ps(Va22,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_mul_ps(Va22,Vtmp4);)
608     ENABLE_SCALAR_IMPLEMENTATION(Sa32.f=Sa32.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va32=_mm_mul_ps(Va32,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_mul_ps(Va32,Vtmp4);)
609 
610 #ifdef COMPUTE_V_AS_MATRIX
611     ENABLE_SCALAR_IMPLEMENTATION(Sv12.f=Sv12.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_mul_ps(Vv12,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_mul_ps(Vv12,Vtmp4);)
612     ENABLE_SCALAR_IMPLEMENTATION(Sv22.f=Sv22.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_mul_ps(Vv22,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_mul_ps(Vv22,Vtmp4);)
613     ENABLE_SCALAR_IMPLEMENTATION(Sv32.f=Sv32.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_mul_ps(Vv32,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_mul_ps(Vv32,Vtmp4);)
614 #endif
615 
616     // If columns 1-2 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this)
617 
618 #ifdef COMPUTE_V_AS_QUATERNION
619     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;)                            ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);)                             ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);)
620     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;)                            ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);)                             ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);)
621 
622     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvz.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvz);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvz);)
623     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);)
624     ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);)
625     ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f-Sqvs.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_sub_ps(Vqvvz,Vqvs);)                                  ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_sub_ps(Vqvvz,Vqvs);)
626     ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;)                                         ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;)                                                    ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;)
627 
628     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvx.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvx);)
629     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvy.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvy);)
630     ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp4);)
631     ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f-Sqvvy.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_sub_ps(Vqvvx,Vqvvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_sub_ps(Vqvvx,Vqvvy);)
632     ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Stmp5.f;)                                        ENABLE_SSE_IMPLEMENTATION(Vqvvy=Vtmp5;)                                                   ENABLE_AVX_IMPLEMENTATION(Vqvvy=Vtmp5;)
633 #endif
634 
635     // Swap columns 1-3 if necessary
636 
637     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp1.f<Stmp3.f)?0xffffffff:0;)                ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp1,Vtmp3);)                               ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp1,Vtmp3, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp1,Vtmp3);)
638     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa11.ui^Sa13.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va11,Va13);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va11,Va13);)
639     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
640     ENABLE_SCALAR_IMPLEMENTATION(Sa11.ui=Sa11.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va11=_mm_xor_ps(Va11,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_xor_ps(Va11,Vtmp5);)
641     ENABLE_SCALAR_IMPLEMENTATION(Sa13.ui=Sa13.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va13=_mm_xor_ps(Va13,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_xor_ps(Va13,Vtmp5);)
642 
643     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa21.ui^Sa23.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va21,Va23);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va21,Va23);)
644     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
645     ENABLE_SCALAR_IMPLEMENTATION(Sa21.ui=Sa21.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va21=_mm_xor_ps(Va21,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_xor_ps(Va21,Vtmp5);)
646     ENABLE_SCALAR_IMPLEMENTATION(Sa23.ui=Sa23.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va23=_mm_xor_ps(Va23,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_xor_ps(Va23,Vtmp5);)
647 
648     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa31.ui^Sa33.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va31,Va33);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va31,Va33);)
649     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
650     ENABLE_SCALAR_IMPLEMENTATION(Sa31.ui=Sa31.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va31=_mm_xor_ps(Va31,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_xor_ps(Va31,Vtmp5);)
651     ENABLE_SCALAR_IMPLEMENTATION(Sa33.ui=Sa33.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va33=_mm_xor_ps(Va33,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_xor_ps(Va33,Vtmp5);)
652 
653 #ifdef COMPUTE_V_AS_MATRIX
654     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv11.ui^Sv13.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv11,Vv13);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv11,Vv13);)
655     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
656     ENABLE_SCALAR_IMPLEMENTATION(Sv11.ui=Sv11.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_xor_ps(Vv11,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_xor_ps(Vv11,Vtmp5);)
657     ENABLE_SCALAR_IMPLEMENTATION(Sv13.ui=Sv13.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_xor_ps(Vv13,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_xor_ps(Vv13,Vtmp5);)
658 
659     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv21.ui^Sv23.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv21,Vv23);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv21,Vv23);)
660     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
661     ENABLE_SCALAR_IMPLEMENTATION(Sv21.ui=Sv21.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_xor_ps(Vv21,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_xor_ps(Vv21,Vtmp5);)
662     ENABLE_SCALAR_IMPLEMENTATION(Sv23.ui=Sv23.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_xor_ps(Vv23,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_xor_ps(Vv23,Vtmp5);)
663 
664     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv31.ui^Sv33.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv31,Vv33);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv31,Vv33);)
665     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
666     ENABLE_SCALAR_IMPLEMENTATION(Sv31.ui=Sv31.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_xor_ps(Vv31,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_xor_ps(Vv31,Vtmp5);)
667     ENABLE_SCALAR_IMPLEMENTATION(Sv33.ui=Sv33.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_xor_ps(Vv33,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_xor_ps(Vv33,Vtmp5);)
668 #endif
669 
670     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp1.ui^Stmp3.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp1,Vtmp3);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp1,Vtmp3);)
671     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
672     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=Stmp1.ui^Stmp5.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_xor_ps(Vtmp1,Vtmp5);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_xor_ps(Vtmp1,Vtmp5);)
673     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.ui=Stmp3.ui^Stmp5.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_xor_ps(Vtmp3,Vtmp5);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_xor_ps(Vtmp3,Vtmp5);)
674 
675     // If columns 1-3 have been swapped, negate 1st column of A and V so that V is still a rotation
676 
677     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;)                                            ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);)                                        ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);)
678     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
679     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;)                                             ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;)                                                    ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;)
680     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);)
681 
682     ENABLE_SCALAR_IMPLEMENTATION(Sa11.f=Sa11.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va11=_mm_mul_ps(Va11,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Va11=_mm256_mul_ps(Va11,Vtmp4);)
683     ENABLE_SCALAR_IMPLEMENTATION(Sa21.f=Sa21.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va21=_mm_mul_ps(Va21,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Va21=_mm256_mul_ps(Va21,Vtmp4);)
684     ENABLE_SCALAR_IMPLEMENTATION(Sa31.f=Sa31.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va31=_mm_mul_ps(Va31,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Va31=_mm256_mul_ps(Va31,Vtmp4);)
685 
686 #ifdef COMPUTE_V_AS_MATRIX
687     ENABLE_SCALAR_IMPLEMENTATION(Sv11.f=Sv11.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv11=_mm_mul_ps(Vv11,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vv11=_mm256_mul_ps(Vv11,Vtmp4);)
688     ENABLE_SCALAR_IMPLEMENTATION(Sv21.f=Sv21.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv21=_mm_mul_ps(Vv21,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vv21=_mm256_mul_ps(Vv21,Vtmp4);)
689     ENABLE_SCALAR_IMPLEMENTATION(Sv31.f=Sv31.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv31=_mm_mul_ps(Vv31,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vv31=_mm256_mul_ps(Vv31,Vtmp4);)
690 #endif
691 
692     // If columns 1-3 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this)
693 
694 #ifdef COMPUTE_V_AS_QUATERNION
695     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;)                            ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);)                             ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);)
696     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;)                            ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);)                             ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);)
697 
698     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvy.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvy);)
699     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);)
700     ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);)
701     ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f-Sqvs.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_sub_ps(Vqvvy,Vqvs);)                                  ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_sub_ps(Vqvvy,Vqvs);)
702     ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;)                                         ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;)                                                    ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;)
703 
704     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvz.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvz);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvz);)
705     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvx.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvx);)
706     ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp4);)
707     ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f-Sqvvx.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_sub_ps(Vqvvz,Vqvvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_sub_ps(Vqvvz,Vqvvx);)
708     ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Stmp5.f;)                                        ENABLE_SSE_IMPLEMENTATION(Vqvvx=Vtmp5;)                                                   ENABLE_AVX_IMPLEMENTATION(Vqvvx=Vtmp5;)
709 #endif
710 
711     // Swap columns 2-3 if necessary
712 
713     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.ui=(Stmp2.f<Stmp3.f)?0xffffffff:0;)                ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_cmplt_ps(Vtmp2,Vtmp3);)                               ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmp_ps(Vtmp2,Vtmp3, _CMP_LT_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_cmplt_ps(Vtmp2,Vtmp3);)
714     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa12.ui^Sa13.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va12,Va13);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va12,Va13);)
715     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
716     ENABLE_SCALAR_IMPLEMENTATION(Sa12.ui=Sa12.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va12=_mm_xor_ps(Va12,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va12=_mm256_xor_ps(Va12,Vtmp5);)
717     ENABLE_SCALAR_IMPLEMENTATION(Sa13.ui=Sa13.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va13=_mm_xor_ps(Va13,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_xor_ps(Va13,Vtmp5);)
718 
719     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa22.ui^Sa23.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va22,Va23);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va22,Va23);)
720     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
721     ENABLE_SCALAR_IMPLEMENTATION(Sa22.ui=Sa22.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va22=_mm_xor_ps(Va22,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va22=_mm256_xor_ps(Va22,Vtmp5);)
722     ENABLE_SCALAR_IMPLEMENTATION(Sa23.ui=Sa23.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va23=_mm_xor_ps(Va23,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_xor_ps(Va23,Vtmp5);)
723 
724     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sa32.ui^Sa33.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Va32,Va33);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Va32,Va33);)
725     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
726     ENABLE_SCALAR_IMPLEMENTATION(Sa32.ui=Sa32.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va32=_mm_xor_ps(Va32,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va32=_mm256_xor_ps(Va32,Vtmp5);)
727     ENABLE_SCALAR_IMPLEMENTATION(Sa33.ui=Sa33.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Va33=_mm_xor_ps(Va33,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_xor_ps(Va33,Vtmp5);)
728 
729 #ifdef COMPUTE_V_AS_MATRIX
730     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv12.ui^Sv13.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv12,Vv13);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv12,Vv13);)
731     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
732     ENABLE_SCALAR_IMPLEMENTATION(Sv12.ui=Sv12.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv12=_mm_xor_ps(Vv12,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv12=_mm256_xor_ps(Vv12,Vtmp5);)
733     ENABLE_SCALAR_IMPLEMENTATION(Sv13.ui=Sv13.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_xor_ps(Vv13,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_xor_ps(Vv13,Vtmp5);)
734 
735     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv22.ui^Sv23.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv22,Vv23);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv22,Vv23);)
736     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
737     ENABLE_SCALAR_IMPLEMENTATION(Sv22.ui=Sv22.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv22=_mm_xor_ps(Vv22,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv22=_mm256_xor_ps(Vv22,Vtmp5);)
738     ENABLE_SCALAR_IMPLEMENTATION(Sv23.ui=Sv23.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_xor_ps(Vv23,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_xor_ps(Vv23,Vtmp5);)
739 
740     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Sv32.ui^Sv33.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vv32,Vv33);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vv32,Vv33);)
741     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
742     ENABLE_SCALAR_IMPLEMENTATION(Sv32.ui=Sv32.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv32=_mm_xor_ps(Vv32,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv32=_mm256_xor_ps(Vv32,Vtmp5);)
743     ENABLE_SCALAR_IMPLEMENTATION(Sv33.ui=Sv33.ui^Stmp5.ui;)                               ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_xor_ps(Vv33,Vtmp5);)                                   ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_xor_ps(Vv33,Vtmp5);)
744 #endif
745 
746     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp2.ui^Stmp3.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_xor_ps(Vtmp2,Vtmp3);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_xor_ps(Vtmp2,Vtmp3);)
747     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
748     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Stmp2.ui^Stmp5.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_xor_ps(Vtmp2,Vtmp5);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_xor_ps(Vtmp2,Vtmp5);)
749     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.ui=Stmp3.ui^Stmp5.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_xor_ps(Vtmp3,Vtmp5);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_xor_ps(Vtmp3,Vtmp5);)
750 
751     // If columns 2-3 have been swapped, negate 3rd column of A and V so that V is still a rotation
752 
753     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=-2.;)                                            ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_set1_ps(-2.);)                                        ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_set1_ps(-2.);)
754     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.ui=Stmp5.ui&Stmp4.ui;)                             ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_and_ps(Vtmp5,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_and_ps(Vtmp5,Vtmp4);)
755     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=1.;)                                             ENABLE_SSE_IMPLEMENTATION(Vtmp4=Vone;)                                                    ENABLE_AVX_IMPLEMENTATION(Vtmp4=Vone;)
756     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Stmp5.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vtmp5);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vtmp5);)
757 
758     ENABLE_SCALAR_IMPLEMENTATION(Sa13.f=Sa13.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va13=_mm_mul_ps(Va13,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Va13=_mm256_mul_ps(Va13,Vtmp4);)
759     ENABLE_SCALAR_IMPLEMENTATION(Sa23.f=Sa23.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va23=_mm_mul_ps(Va23,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Va23=_mm256_mul_ps(Va23,Vtmp4);)
760     ENABLE_SCALAR_IMPLEMENTATION(Sa33.f=Sa33.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Va33=_mm_mul_ps(Va33,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Va33=_mm256_mul_ps(Va33,Vtmp4);)
761 
762 #ifdef COMPUTE_V_AS_MATRIX
763     ENABLE_SCALAR_IMPLEMENTATION(Sv13.f=Sv13.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv13=_mm_mul_ps(Vv13,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vv13=_mm256_mul_ps(Vv13,Vtmp4);)
764     ENABLE_SCALAR_IMPLEMENTATION(Sv23.f=Sv23.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv23=_mm_mul_ps(Vv23,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vv23=_mm256_mul_ps(Vv23,Vtmp4);)
765     ENABLE_SCALAR_IMPLEMENTATION(Sv33.f=Sv33.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vv33=_mm_mul_ps(Vv33,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vv33=_mm256_mul_ps(Vv33,Vtmp4);)
766 #endif
767 
768     // If columns 2-3 have been swapped, also update quaternion representation of V (the quaternion may become un-normalized after this)
769 
770 #ifdef COMPUTE_V_AS_QUATERNION
771     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f*Sone_half.f;)                            ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp4,Vone_half);)                             ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp4,Vone_half);)
772     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sone_half.f;)                            ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vone_half);)                             ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vone_half);)
773 
774     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvx.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvx);)
775     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvs.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvs);)                                  ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvs);)
776     ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp4.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp4);)                                   ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp4);)
777     ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f-Sqvs.f;)                                 ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_sub_ps(Vqvvx,Vqvs);)                                  ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_sub_ps(Vqvvx,Vqvs);)
778     ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Stmp5.f;)                                         ENABLE_SSE_IMPLEMENTATION(Vqvs=Vtmp5;)                                                    ENABLE_AVX_IMPLEMENTATION(Vqvs=Vtmp5;)
779 
780     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp4.f*Sqvvy.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp4,Vqvvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp4,Vqvvy);)
781     ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f+Sqvvz.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_add_ps(Vtmp5,Vqvvz);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_add_ps(Vtmp5,Vqvvz);)
782     ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp4);)
783     ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f-Sqvvz.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_sub_ps(Vqvvy,Vqvvz);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_sub_ps(Vqvvy,Vqvvz);)
784     ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Stmp5.f;)                                        ENABLE_SSE_IMPLEMENTATION(Vqvvz=Vtmp5;)                                                   ENABLE_AVX_IMPLEMENTATION(Vqvvz=Vtmp5;)
785 #endif
786 
787 #ifdef COMPUTE_V_AS_MATRIX
788 #ifdef PRINT_DEBUGGING_OUTPUT
789 #ifdef USE_SCALAR_IMPLEMENTATION
790     std::cout<<"Scalar V ="<<std::endl;
791     std::cout<<std::setw(12)<<Sv11.f<<"  "<<std::setw(12)<<Sv12.f<<"  "<<std::setw(12)<<Sv13.f<<std::endl;
792     std::cout<<std::setw(12)<<Sv21.f<<"  "<<std::setw(12)<<Sv22.f<<"  "<<std::setw(12)<<Sv23.f<<std::endl;
793     std::cout<<std::setw(12)<<Sv31.f<<"  "<<std::setw(12)<<Sv32.f<<"  "<<std::setw(12)<<Sv33.f<<std::endl;
794 #endif
795 #ifdef USE_SSE_IMPLEMENTATION
796     _mm_storeu_ps(buf,Vv11);V11=buf[0];
797     _mm_storeu_ps(buf,Vv21);V21=buf[0];
798     _mm_storeu_ps(buf,Vv31);V31=buf[0];
799     _mm_storeu_ps(buf,Vv12);V12=buf[0];
800     _mm_storeu_ps(buf,Vv22);V22=buf[0];
801     _mm_storeu_ps(buf,Vv32);V32=buf[0];
802     _mm_storeu_ps(buf,Vv13);V13=buf[0];
803     _mm_storeu_ps(buf,Vv23);V23=buf[0];
804     _mm_storeu_ps(buf,Vv33);V33=buf[0];
805     std::cout<<"Vector V ="<<std::endl;
806     std::cout<<std::setw(12)<<V11<<"  "<<std::setw(12)<<V12<<"  "<<std::setw(12)<<V13<<std::endl;
807     std::cout<<std::setw(12)<<V21<<"  "<<std::setw(12)<<V22<<"  "<<std::setw(12)<<V23<<std::endl;
808     std::cout<<std::setw(12)<<V31<<"  "<<std::setw(12)<<V32<<"  "<<std::setw(12)<<V33<<std::endl;
809 #endif
810 #ifdef USE_AVX_IMPLEMENTATION
811     _mm256_storeu_ps(buf,Vv11);V11=buf[0];
812     _mm256_storeu_ps(buf,Vv21);V21=buf[0];
813     _mm256_storeu_ps(buf,Vv31);V31=buf[0];
814     _mm256_storeu_ps(buf,Vv12);V12=buf[0];
815     _mm256_storeu_ps(buf,Vv22);V22=buf[0];
816     _mm256_storeu_ps(buf,Vv32);V32=buf[0];
817     _mm256_storeu_ps(buf,Vv13);V13=buf[0];
818     _mm256_storeu_ps(buf,Vv23);V23=buf[0];
819     _mm256_storeu_ps(buf,Vv33);V33=buf[0];
820     std::cout<<"Vector V ="<<std::endl;
821     std::cout<<std::setw(12)<<V11<<"  "<<std::setw(12)<<V12<<"  "<<std::setw(12)<<V13<<std::endl;
822     std::cout<<std::setw(12)<<V21<<"  "<<std::setw(12)<<V22<<"  "<<std::setw(12)<<V23<<std::endl;
823     std::cout<<std::setw(12)<<V31<<"  "<<std::setw(12)<<V32<<"  "<<std::setw(12)<<V33<<std::endl;
824 #endif
825 #endif
826 #endif
827 
828 #ifdef PRINT_DEBUGGING_OUTPUT
829 #ifdef USE_SCALAR_IMPLEMENTATION
830     std::cout<<"Scalar A (after multiplying with V) ="<<std::endl;
831     std::cout<<std::setw(12)<<Sa11.f<<"  "<<std::setw(12)<<Sa12.f<<"  "<<std::setw(12)<<Sa13.f<<std::endl;
832     std::cout<<std::setw(12)<<Sa21.f<<"  "<<std::setw(12)<<Sa22.f<<"  "<<std::setw(12)<<Sa23.f<<std::endl;
833     std::cout<<std::setw(12)<<Sa31.f<<"  "<<std::setw(12)<<Sa32.f<<"  "<<std::setw(12)<<Sa33.f<<std::endl;
834 #endif
835 #ifdef USE_SSE_IMPLEMENTATION
836     _mm_storeu_ps(buf,Va11);A11=buf[0];
837     _mm_storeu_ps(buf,Va21);A21=buf[0];
838     _mm_storeu_ps(buf,Va31);A31=buf[0];
839     _mm_storeu_ps(buf,Va12);A12=buf[0];
840     _mm_storeu_ps(buf,Va22);A22=buf[0];
841     _mm_storeu_ps(buf,Va32);A32=buf[0];
842     _mm_storeu_ps(buf,Va13);A13=buf[0];
843     _mm_storeu_ps(buf,Va23);A23=buf[0];
844     _mm_storeu_ps(buf,Va33);A33=buf[0];
845     std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
846     std::cout<<std::setw(12)<<A11<<"  "<<std::setw(12)<<A12<<"  "<<std::setw(12)<<A13<<std::endl;
847     std::cout<<std::setw(12)<<A21<<"  "<<std::setw(12)<<A22<<"  "<<std::setw(12)<<A23<<std::endl;
848     std::cout<<std::setw(12)<<A31<<"  "<<std::setw(12)<<A32<<"  "<<std::setw(12)<<A33<<std::endl;
849 #endif
850 #ifdef USE_AVX_IMPLEMENTATION
851     _mm256_storeu_ps(buf,Va11);A11=buf[0];
852     _mm256_storeu_ps(buf,Va21);A21=buf[0];
853     _mm256_storeu_ps(buf,Va31);A31=buf[0];
854     _mm256_storeu_ps(buf,Va12);A12=buf[0];
855     _mm256_storeu_ps(buf,Va22);A22=buf[0];
856     _mm256_storeu_ps(buf,Va32);A32=buf[0];
857     _mm256_storeu_ps(buf,Va13);A13=buf[0];
858     _mm256_storeu_ps(buf,Va23);A23=buf[0];
859     _mm256_storeu_ps(buf,Va33);A33=buf[0];
860     std::cout<<"Vector A (after multiplying with V) ="<<std::endl;
861     std::cout<<std::setw(12)<<A11<<"  "<<std::setw(12)<<A12<<"  "<<std::setw(12)<<A13<<std::endl;
862     std::cout<<std::setw(12)<<A21<<"  "<<std::setw(12)<<A22<<"  "<<std::setw(12)<<A23<<std::endl;
863     std::cout<<std::setw(12)<<A31<<"  "<<std::setw(12)<<A32<<"  "<<std::setw(12)<<A33<<std::endl;
864 #endif
865 #endif
866 
867     //###########################################################
868     // Re-normalize quaternion for matrix V
869     //###########################################################
870 
871 #ifdef COMPUTE_V_AS_QUATERNION
872     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sqvs.f*Sqvs.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vqvs,Vqvs);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vqvs,Vqvs);)
873     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvx.f*Sqvvx.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvx,Vqvvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvx,Vqvvx);)
874     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
875     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvy.f*Sqvvy.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvy,Vqvvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvy,Vqvvy);)
876     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
877     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sqvvz.f*Sqvvz.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vqvvz,Vqvvz);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vqvvz,Vqvvz);)
878     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp1.f+Stmp2.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(Vtmp1,Vtmp2);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(Vtmp1,Vtmp2);)
879     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=rsqrt(Stmp2.f);)                                 ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_rsqrt_ps(Vtmp2);)                                     ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_rsqrt_ps(Vtmp2);)
880 
881 #ifdef PERFORM_STRICT_QUATERNION_RENORMALIZATION
882     ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp1.f*Sone_half.f;)                            ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vtmp1,Vone_half);)                             ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vtmp1,Vone_half);)
883     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp4);)
884     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f*Stmp3.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp1,Vtmp3);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp1,Vtmp3);)
885     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp2.f*Stmp3.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vtmp2,Vtmp3);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vtmp2,Vtmp3);)
886     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f+Stmp4.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_add_ps(Vtmp1,Vtmp4);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_add_ps(Vtmp1,Vtmp4);)
887     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Stmp1.f-Stmp3.f;)                                ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_sub_ps(Vtmp1,Vtmp3);)                                 ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_sub_ps(Vtmp1,Vtmp3);)
888 #endif
889 
890     ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f*Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vqvs,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vqvs,Vtmp1);)
891     ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sqvvx.f*Stmp1.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vqvvx,Vtmp1);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vqvvx,Vtmp1);)
892     ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sqvvy.f*Stmp1.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vqvvy,Vtmp1);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vqvvy,Vtmp1);)
893     ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sqvvz.f*Stmp1.f;)                                ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vqvvz,Vtmp1);)                                 ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vqvvz,Vtmp1);)
894 
895 #ifdef PRINT_DEBUGGING_OUTPUT
896 #ifdef USE_SCALAR_IMPLEMENTATION
897     std::cout<<"Scalar qV ="<<std::endl;
898     std::cout<<std::setw(12)<<Sqvs.f<<"  "<<std::setw(12)<<Sqvvx.f<<"  "<<std::setw(12)<<Sqvvy.f<<"  "<<std::setw(12)<<Sqvvz.f<<std::endl;
899 #endif
900 #ifdef USE_SSE_IMPLEMENTATION
901     _mm_storeu_ps(buf,Vqvs);QVS=buf[0];
902     _mm_storeu_ps(buf,Vqvvx);QVVX=buf[0];
903     _mm_storeu_ps(buf,Vqvvy);QVVY=buf[0];
904     _mm_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
905     std::cout<<"Vector qV ="<<std::endl;
906     std::cout<<std::setw(12)<<QVS<<"  "<<std::setw(12)<<QVVX<<"  "<<std::setw(12)<<QVVY<<"  "<<std::setw(12)<<QVVZ<<std::endl;
907 #endif
908 #ifdef USE_AVX_IMPLEMENTATION
909     _mm256_storeu_ps(buf,Vqvs);QVS=buf[0];
910     _mm256_storeu_ps(buf,Vqvvx);QVVX=buf[0];
911     _mm256_storeu_ps(buf,Vqvvy);QVVY=buf[0];
912     _mm256_storeu_ps(buf,Vqvvz);QVVZ=buf[0];
913     std::cout<<"Vector qV ="<<std::endl;
914     std::cout<<std::setw(12)<<QVS<<"  "<<std::setw(12)<<QVVX<<"  "<<std::setw(12)<<QVVY<<"  "<<std::setw(12)<<QVVZ<<std::endl;
915 #endif
916 #endif
917 #endif
918 
919     //###########################################################
920     // Construct QR factorization of A*V (=U*D) using Givens rotations
921     //###########################################################
922 
923 #ifdef COMPUTE_U_AS_MATRIX
924     ENABLE_SCALAR_IMPLEMENTATION(Su11.f=1.;)                                              ENABLE_SSE_IMPLEMENTATION(Vu11=Vone;)                                                     ENABLE_AVX_IMPLEMENTATION(Vu11=Vone;)
925     ENABLE_SCALAR_IMPLEMENTATION(Su21.f=0.;)                                              ENABLE_SSE_IMPLEMENTATION(Vu21=_mm_xor_ps(Vu21,Vu21);)                                    ENABLE_AVX_IMPLEMENTATION(Vu21=_mm256_xor_ps(Vu21,Vu21);)
926     ENABLE_SCALAR_IMPLEMENTATION(Su31.f=0.;)                                              ENABLE_SSE_IMPLEMENTATION(Vu31=_mm_xor_ps(Vu31,Vu31);)                                    ENABLE_AVX_IMPLEMENTATION(Vu31=_mm256_xor_ps(Vu31,Vu31);)
927     ENABLE_SCALAR_IMPLEMENTATION(Su12.f=0.;)                                              ENABLE_SSE_IMPLEMENTATION(Vu12=_mm_xor_ps(Vu12,Vu12);)                                    ENABLE_AVX_IMPLEMENTATION(Vu12=_mm256_xor_ps(Vu12,Vu12);)
928     ENABLE_SCALAR_IMPLEMENTATION(Su22.f=1.;)                                              ENABLE_SSE_IMPLEMENTATION(Vu22=Vone;)                                                     ENABLE_AVX_IMPLEMENTATION(Vu22=Vone;)
929     ENABLE_SCALAR_IMPLEMENTATION(Su32.f=0.;)                                              ENABLE_SSE_IMPLEMENTATION(Vu32=_mm_xor_ps(Vu32,Vu32);)                                    ENABLE_AVX_IMPLEMENTATION(Vu32=_mm256_xor_ps(Vu32,Vu32);)
930     ENABLE_SCALAR_IMPLEMENTATION(Su13.f=0.;)                                              ENABLE_SSE_IMPLEMENTATION(Vu13=_mm_xor_ps(Vu13,Vu13);)                                    ENABLE_AVX_IMPLEMENTATION(Vu13=_mm256_xor_ps(Vu13,Vu13);)
931     ENABLE_SCALAR_IMPLEMENTATION(Su23.f=0.;)                                              ENABLE_SSE_IMPLEMENTATION(Vu23=_mm_xor_ps(Vu23,Vu23);)                                    ENABLE_AVX_IMPLEMENTATION(Vu23=_mm256_xor_ps(Vu23,Vu23);)
932     ENABLE_SCALAR_IMPLEMENTATION(Su33.f=1.;)                                              ENABLE_SSE_IMPLEMENTATION(Vu33=Vone;)                                                     ENABLE_AVX_IMPLEMENTATION(Vu33=Vone;)
933 #endif
934 
935 #ifdef COMPUTE_U_AS_QUATERNION
936     ENABLE_SCALAR_IMPLEMENTATION(Squs.f=1.;)                                              ENABLE_SSE_IMPLEMENTATION(Vqus=Vone;)                                                     ENABLE_AVX_IMPLEMENTATION(Vqus=Vone;)
937     ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=0.;)                                             ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_xor_ps(Vquvx,Vquvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_xor_ps(Vquvx,Vquvx);)
938     ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=0.;)                                             ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_xor_ps(Vquvy,Vquvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_xor_ps(Vquvy,Vquvy);)
939     ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=0.;)                                             ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_xor_ps(Vquvz,Vquvz);)                                 ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_xor_ps(Vquvz,Vquvz);)
940 #endif
941 
942     // First Givens rotation
943 
944 #define SAPIVOT Sa11
945 #define SANPIVOT Sa21
946 #define SA11 Sa11
947 #define SA21 Sa21
948 #define SA12 Sa12
949 #define SA22 Sa22
950 #define SA13 Sa13
951 #define SA23 Sa23
952 #define SU11 Su11
953 #define SU12 Su12
954 #define SU21 Su21
955 #define SU22 Su22
956 #define SU31 Su31
957 #define SU32 Su32
958 
959 #define VAPIVOT Va11
960 #define VANPIVOT Va21
961 #define VA11 Va11
962 #define VA21 Va21
963 #define VA12 Va12
964 #define VA22 Va22
965 #define VA13 Va13
966 #define VA23 Va23
967 #define VU11 Vu11
968 #define VU12 Vu12
969 #define VU21 Vu21
970 #define VU22 Vu22
971 #define VU31 Vu31
972 #define VU32 Vu32
973 
974 #include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp"
975 
976 #undef SAPIVOT
977 #undef SANPIVOT
978 #undef SA11
979 #undef SA21
980 #undef SA12
981 #undef SA22
982 #undef SA13
983 #undef SA23
984 #undef SU11
985 #undef SU12
986 #undef SU21
987 #undef SU22
988 #undef SU31
989 #undef SU32
990 
991 #undef VAPIVOT
992 #undef VANPIVOT
993 #undef VA11
994 #undef VA21
995 #undef VA12
996 #undef VA22
997 #undef VA13
998 #undef VA23
999 #undef VU11
1000 #undef VU12
1001 #undef VU21
1002 #undef VU22
1003 #undef VU31
1004 #undef VU32
1005 
1006     // Update quaternion representation of U
1007 
1008 #ifdef COMPUTE_U_AS_QUATERNION
1009     ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f;)                                           ENABLE_SSE_IMPLEMENTATION(Vqus=Vch;)                                                      ENABLE_AVX_IMPLEMENTATION(Vqus=Vch;)
1010     ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=0.;)                                             ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_xor_ps(Vquvx,Vquvx);)                                 ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_xor_ps(Vquvx,Vquvx);)
1011     ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=0.;)                                             ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_xor_ps(Vquvy,Vquvy);)                                 ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_xor_ps(Vquvy,Vquvy);)
1012     ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Ssh.f;)                                          ENABLE_SSE_IMPLEMENTATION(Vquvz=Vsh;)                                                     ENABLE_AVX_IMPLEMENTATION(Vquvz=Vsh;)
1013 #endif
1014 
1015     // Second Givens rotation
1016 
1017 #define SAPIVOT Sa11
1018 #define SANPIVOT Sa31
1019 #define SA11 Sa11
1020 #define SA21 Sa31
1021 #define SA12 Sa12
1022 #define SA22 Sa32
1023 #define SA13 Sa13
1024 #define SA23 Sa33
1025 #define SU11 Su11
1026 #define SU12 Su13
1027 #define SU21 Su21
1028 #define SU22 Su23
1029 #define SU31 Su31
1030 #define SU32 Su33
1031 
1032 #define VAPIVOT Va11
1033 #define VANPIVOT Va31
1034 #define VA11 Va11
1035 #define VA21 Va31
1036 #define VA12 Va12
1037 #define VA22 Va32
1038 #define VA13 Va13
1039 #define VA23 Va33
1040 #define VU11 Vu11
1041 #define VU12 Vu13
1042 #define VU21 Vu21
1043 #define VU22 Vu23
1044 #define VU31 Vu31
1045 #define VU32 Vu33
1046 
1047 #include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp"
1048 
1049 #undef SAPIVOT
1050 #undef SANPIVOT
1051 #undef SA11
1052 #undef SA21
1053 #undef SA12
1054 #undef SA22
1055 #undef SA13
1056 #undef SA23
1057 #undef SU11
1058 #undef SU12
1059 #undef SU21
1060 #undef SU22
1061 #undef SU31
1062 #undef SU32
1063 
1064 #undef VAPIVOT
1065 #undef VANPIVOT
1066 #undef VA11
1067 #undef VA21
1068 #undef VA12
1069 #undef VA22
1070 #undef VA13
1071 #undef VA23
1072 #undef VU11
1073 #undef VU12
1074 #undef VU21
1075 #undef VU22
1076 #undef VU31
1077 #undef VU32
1078 
1079     // Update quaternion representation of U
1080 
1081 #ifdef COMPUTE_U_AS_QUATERNION
1082     ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Ssh.f*Squvz.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_mul_ps(Vsh,Vquvz);)                                   ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_mul_ps(Vsh,Vquvz);)
1083     ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Squs.f;)                                     ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqus);)                                      ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqus);)
1084     ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Squvy.f-Ssh.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_sub_ps(Vquvy,Vsh);)                                   ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_sub_ps(Vquvy,Vsh);)
1085     ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f*Squs.f;)                                    ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_mul_ps(Vch,Vqus);)                                     ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_mul_ps(Vch,Vqus);)
1086     ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Sch.f*Squvz.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_mul_ps(Vch,Vquvz);)                                   ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_mul_ps(Vch,Vquvz);)
1087 #endif
1088 
1089     // Third Givens rotation
1090 
1091 #define SAPIVOT Sa22
1092 #define SANPIVOT Sa32
1093 #define SA11 Sa21
1094 #define SA21 Sa31
1095 #define SA12 Sa22
1096 #define SA22 Sa32
1097 #define SA13 Sa23
1098 #define SA23 Sa33
1099 #define SU11 Su12
1100 #define SU12 Su13
1101 #define SU21 Su22
1102 #define SU22 Su23
1103 #define SU31 Su32
1104 #define SU32 Su33
1105 
1106 #define VAPIVOT Va22
1107 #define VANPIVOT Va32
1108 #define VA11 Va21
1109 #define VA21 Va31
1110 #define VA12 Va22
1111 #define VA22 Va32
1112 #define VA13 Va23
1113 #define VA23 Va33
1114 #define VU11 Vu12
1115 #define VU12 Vu13
1116 #define VU21 Vu22
1117 #define VU22 Vu23
1118 #define VU31 Vu32
1119 #define VU32 Vu33
1120 
1121 #include "Singular_Value_Decomposition_Givens_QR_Factorization_Kernel.hpp"
1122 
1123 #undef SAPIVOT
1124 #undef SANPIVOT
1125 #undef SA11
1126 #undef SA21
1127 #undef SA12
1128 #undef SA22
1129 #undef SA13
1130 #undef SA23
1131 #undef SU11
1132 #undef SU12
1133 #undef SU21
1134 #undef SU22
1135 #undef SU31
1136 #undef SU32
1137 
1138 #undef VAPIVOT
1139 #undef VANPIVOT
1140 #undef VA11
1141 #undef VA21
1142 #undef VA12
1143 #undef VA22
1144 #undef VA13
1145 #undef VA23
1146 #undef VU11
1147 #undef VU12
1148 #undef VU21
1149 #undef VU22
1150 #undef VU31
1151 #undef VU32
1152 
1153     // Update quaternion representation of U
1154 
1155 #ifdef COMPUTE_U_AS_QUATERNION
1156     ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Squvx.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vquvx);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vquvx);)
1157     ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Squvy.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vquvy);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vquvy);)
1158     ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Ssh.f*Squvz.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vsh,Vquvz);)                                   ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vsh,Vquvz);)
1159     ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Squs.f;)                                     ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqus);)                                      ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqus);)
1160     ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Sch.f*Squs.f;)                                    ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_mul_ps(Vch,Vqus);)                                     ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_mul_ps(Vch,Vqus);)
1161     ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Sch.f*Squvx.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_mul_ps(Vch,Vquvx);)                                   ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_mul_ps(Vch,Vquvx);)
1162     ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Sch.f*Squvy.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_mul_ps(Vch,Vquvy);)                                   ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_mul_ps(Vch,Vquvy);)
1163     ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Sch.f*Squvz.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_mul_ps(Vch,Vquvz);)                                   ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_mul_ps(Vch,Vquvz);)
1164     ENABLE_SCALAR_IMPLEMENTATION(Squvx.f=Squvx.f+Ssh.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vquvx=_mm_add_ps(Vquvx,Vsh);)                                   ENABLE_AVX_IMPLEMENTATION(Vquvx=_mm256_add_ps(Vquvx,Vsh);)
1165     ENABLE_SCALAR_IMPLEMENTATION(Squs.f=Squs.f-Stmp1.f;)                                  ENABLE_SSE_IMPLEMENTATION(Vqus=_mm_sub_ps(Vqus,Vtmp1);)                                   ENABLE_AVX_IMPLEMENTATION(Vqus=_mm256_sub_ps(Vqus,Vtmp1);)
1166     ENABLE_SCALAR_IMPLEMENTATION(Squvy.f=Squvy.f+Stmp3.f;)                                ENABLE_SSE_IMPLEMENTATION(Vquvy=_mm_add_ps(Vquvy,Vtmp3);)                                 ENABLE_AVX_IMPLEMENTATION(Vquvy=_mm256_add_ps(Vquvy,Vtmp3);)
1167     ENABLE_SCALAR_IMPLEMENTATION(Squvz.f=Squvz.f-Stmp2.f;)                                ENABLE_SSE_IMPLEMENTATION(Vquvz=_mm_sub_ps(Vquvz,Vtmp2);)                                 ENABLE_AVX_IMPLEMENTATION(Vquvz=_mm256_sub_ps(Vquvz,Vtmp2);)
1168 #endif
1169 
1170 #ifdef COMPUTE_U_AS_MATRIX
1171 #ifdef PRINT_DEBUGGING_OUTPUT
1172 #ifdef USE_SCALAR_IMPLEMENTATION
1173     std::cout<<"Scalar U ="<<std::endl;
1174     std::cout<<std::setw(12)<<Su11.f<<"  "<<std::setw(12)<<Su12.f<<"  "<<std::setw(12)<<Su13.f<<std::endl;
1175     std::cout<<std::setw(12)<<Su21.f<<"  "<<std::setw(12)<<Su22.f<<"  "<<std::setw(12)<<Su23.f<<std::endl;
1176     std::cout<<std::setw(12)<<Su31.f<<"  "<<std::setw(12)<<Su32.f<<"  "<<std::setw(12)<<Su33.f<<std::endl;
1177 #endif
1178 #ifdef USE_SSE_IMPLEMENTATION
1179     _mm_storeu_ps(buf,Vu11);U11=buf[0];
1180     _mm_storeu_ps(buf,Vu21);U21=buf[0];
1181     _mm_storeu_ps(buf,Vu31);U31=buf[0];
1182     _mm_storeu_ps(buf,Vu12);U12=buf[0];
1183     _mm_storeu_ps(buf,Vu22);U22=buf[0];
1184     _mm_storeu_ps(buf,Vu32);U32=buf[0];
1185     _mm_storeu_ps(buf,Vu13);U13=buf[0];
1186     _mm_storeu_ps(buf,Vu23);U23=buf[0];
1187     _mm_storeu_ps(buf,Vu33);U33=buf[0];
1188     std::cout<<"Vector U ="<<std::endl;
1189     std::cout<<std::setw(12)<<U11<<"  "<<std::setw(12)<<U12<<"  "<<std::setw(12)<<U13<<std::endl;
1190     std::cout<<std::setw(12)<<U21<<"  "<<std::setw(12)<<U22<<"  "<<std::setw(12)<<U23<<std::endl;
1191     std::cout<<std::setw(12)<<U31<<"  "<<std::setw(12)<<U32<<"  "<<std::setw(12)<<U33<<std::endl;
1192 #endif
1193 #ifdef USE_AVX_IMPLEMENTATION
1194     _mm256_storeu_ps(buf,Vu11);U11=buf[0];
1195     _mm256_storeu_ps(buf,Vu21);U21=buf[0];
1196     _mm256_storeu_ps(buf,Vu31);U31=buf[0];
1197     _mm256_storeu_ps(buf,Vu12);U12=buf[0];
1198     _mm256_storeu_ps(buf,Vu22);U22=buf[0];
1199     _mm256_storeu_ps(buf,Vu32);U32=buf[0];
1200     _mm256_storeu_ps(buf,Vu13);U13=buf[0];
1201     _mm256_storeu_ps(buf,Vu23);U23=buf[0];
1202     _mm256_storeu_ps(buf,Vu33);U33=buf[0];
1203     std::cout<<"Vector U ="<<std::endl;
1204     std::cout<<std::setw(12)<<U11<<"  "<<std::setw(12)<<U12<<"  "<<std::setw(12)<<U13<<std::endl;
1205     std::cout<<std::setw(12)<<U21<<"  "<<std::setw(12)<<U22<<"  "<<std::setw(12)<<U23<<std::endl;
1206     std::cout<<std::setw(12)<<U31<<"  "<<std::setw(12)<<U32<<"  "<<std::setw(12)<<U33<<std::endl;
1207 #endif
1208 #endif
1209 #endif
1210 
1211 #ifdef PRINT_DEBUGGING_OUTPUT
1212 #ifdef USE_SCALAR_IMPLEMENTATION
1213     std::cout<<"Scalar A (after multiplying with U-transpose and V) ="<<std::endl;
1214     std::cout<<std::setw(12)<<Sa11.f<<"  "<<std::setw(12)<<Sa12.f<<"  "<<std::setw(12)<<Sa13.f<<std::endl;
1215     std::cout<<std::setw(12)<<Sa21.f<<"  "<<std::setw(12)<<Sa22.f<<"  "<<std::setw(12)<<Sa23.f<<std::endl;
1216     std::cout<<std::setw(12)<<Sa31.f<<"  "<<std::setw(12)<<Sa32.f<<"  "<<std::setw(12)<<Sa33.f<<std::endl;
1217 #endif
1218 #ifdef USE_SSE_IMPLEMENTATION
1219     _mm_storeu_ps(buf,Va11);A11=buf[0];
1220     _mm_storeu_ps(buf,Va21);A21=buf[0];
1221     _mm_storeu_ps(buf,Va31);A31=buf[0];
1222     _mm_storeu_ps(buf,Va12);A12=buf[0];
1223     _mm_storeu_ps(buf,Va22);A22=buf[0];
1224     _mm_storeu_ps(buf,Va32);A32=buf[0];
1225     _mm_storeu_ps(buf,Va13);A13=buf[0];
1226     _mm_storeu_ps(buf,Va23);A23=buf[0];
1227     _mm_storeu_ps(buf,Va33);A33=buf[0];
1228     std::cout<<"Vector A (after multiplying with U-transpose and V) ="<<std::endl;
1229     std::cout<<std::setw(12)<<A11<<"  "<<std::setw(12)<<A12<<"  "<<std::setw(12)<<A13<<std::endl;
1230     std::cout<<std::setw(12)<<A21<<"  "<<std::setw(12)<<A22<<"  "<<std::setw(12)<<A23<<std::endl;
1231     std::cout<<std::setw(12)<<A31<<"  "<<std::setw(12)<<A32<<"  "<<std::setw(12)<<A33<<std::endl;
1232 #endif
1233 #ifdef USE_AVX_IMPLEMENTATION
1234     _mm256_storeu_ps(buf,Va11);A11=buf[0];
1235     _mm256_storeu_ps(buf,Va21);A21=buf[0];
1236     _mm256_storeu_ps(buf,Va31);A31=buf[0];
1237     _mm256_storeu_ps(buf,Va12);A12=buf[0];
1238     _mm256_storeu_ps(buf,Va22);A22=buf[0];
1239     _mm256_storeu_ps(buf,Va32);A32=buf[0];
1240     _mm256_storeu_ps(buf,Va13);A13=buf[0];
1241     _mm256_storeu_ps(buf,Va23);A23=buf[0];
1242     _mm256_storeu_ps(buf,Va33);A33=buf[0];
1243     std::cout<<"Vector A (after multiplying with U-transpose and V) ="<<std::endl;
1244     std::cout<<std::setw(12)<<A11<<"  "<<std::setw(12)<<A12<<"  "<<std::setw(12)<<A13<<std::endl;
1245     std::cout<<std::setw(12)<<A21<<"  "<<std::setw(12)<<A22<<"  "<<std::setw(12)<<A23<<std::endl;
1246     std::cout<<std::setw(12)<<A31<<"  "<<std::setw(12)<<A32<<"  "<<std::setw(12)<<A33<<std::endl;
1247 #endif
1248 #endif
1249 
1250 #ifdef COMPUTE_U_AS_QUATERNION
1251 #ifdef PRINT_DEBUGGING_OUTPUT
1252 #ifdef USE_SCALAR_IMPLEMENTATION
1253     std::cout<<"Scalar qU ="<<std::endl;
1254     std::cout<<std::setw(12)<<Squs.f<<"  "<<std::setw(12)<<Squvx.f<<"  "<<std::setw(12)<<Squvy.f<<"  "<<std::setw(12)<<Squvz.f<<std::endl;
1255 #endif
1256 #ifdef USE_SSE_IMPLEMENTATION
1257     _mm_storeu_ps(buf,Vqus);QUS=buf[0];
1258     _mm_storeu_ps(buf,Vquvx);QUVX=buf[0];
1259     _mm_storeu_ps(buf,Vquvy);QUVY=buf[0];
1260     _mm_storeu_ps(buf,Vquvz);QUVZ=buf[0];
1261     std::cout<<"Vector qU ="<<std::endl;
1262     std::cout<<std::setw(12)<<QUS<<"  "<<std::setw(12)<<QUVX<<"  "<<std::setw(12)<<QUVY<<"  "<<std::setw(12)<<QUVZ<<std::endl;
1263 #endif
1264 #ifdef USE_AVX_IMPLEMENTATION
1265     _mm256_storeu_ps(buf,Vqus);QUS=buf[0];
1266     _mm256_storeu_ps(buf,Vquvx);QUVX=buf[0];
1267     _mm256_storeu_ps(buf,Vquvy);QUVY=buf[0];
1268     _mm256_storeu_ps(buf,Vquvz);QUVZ=buf[0];
1269     std::cout<<"Vector qU ="<<std::endl;
1270     std::cout<<std::setw(12)<<QUS<<"  "<<std::setw(12)<<QUVX<<"  "<<std::setw(12)<<QUVY<<"  "<<std::setw(12)<<QUVZ<<std::endl;
1271 #endif
1272 #endif
1273 #endif
1274 
1275 #ifdef __INTEL_COMPILER
1276 #pragma warning( default : 592 )
1277 #endif
1278