1 //##################################################################### 2 // Copyright (c) 2010-2011, Eftychios Sifakis. 3 // 4 // Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: 5 // * Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 6 // * Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or 7 // other materials provided with the distribution. 8 // 9 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, 10 // BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT 11 // SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 12 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS 13 // INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 14 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 15 //##################################################################### 16 17 //########################################################### 18 // Compute the Givens angle (and half-angle) 19 //########################################################### 20 21 ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=SS21.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(VS21,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(VS21,Vone_half);) 22 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=SS11.f-SS22.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_sub_ps(VS11,VS22);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_sub_ps(VS11,VS22);) 23 24 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vsh);) 25 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=(Stmp2.f>=Stiny_number.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_cmpge_ps(Vtmp2,Vtiny_number);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmp_ps(Vtmp2,Vtiny_number, _CMP_GE_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmpge_ps(Vtmp2,Vtiny_number);) 26 ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=Stmp1.ui&Ssh.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_and_ps(Vtmp1,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_and_ps(Vtmp1,Vsh);) 27 ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Stmp1.ui&Stmp5.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_and_ps(Vtmp1,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_blendv_ps(Vone,Vtmp5,Vtmp1);) 28 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=~Stmp1.ui&Sone.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_andnot_ps(Vtmp1,Vone);) 29 ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Sch.ui|Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_or_ps(Vch,Vtmp2);) 30 31 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vsh);) 32 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sch.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vch,Vch);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vch,Vch);) 33 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp1,Vtmp2);) 34 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=rsqrt(Stmp3.f);) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_rsqrt_ps(Vtmp3);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_rsqrt_ps(Vtmp3);) 35 36 #ifdef USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION 37 ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Stmp4.f*Sone_half.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_mul_ps(Vtmp4,Vone_half);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_mul_ps(Vtmp4,Vone_half);) 38 ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp4.f*Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_mul_ps(Vtmp4,Vs);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_mul_ps(Vtmp4,Vs);) 39 ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp4.f*Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_mul_ps(Vtmp4,Vc);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_mul_ps(Vtmp4,Vc);) 40 ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp3.f*Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_mul_ps(Vtmp3,Vc);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_mul_ps(Vtmp3,Vc);) 41 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f+Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_add_ps(Vtmp4,Vs);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_add_ps(Vtmp4,Vs);) 42 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vc);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vc);) 43 #endif 44 45 ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Stmp4.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vtmp4,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vtmp4,Vsh);) 46 ENABLE_SCALAR_IMPLEMENTATION(Sch.f=Stmp4.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_mul_ps(Vtmp4,Vch);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_mul_ps(Vtmp4,Vch);) 47 48 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Sfour_gamma_squared.f*Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vfour_gamma_squared,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vfour_gamma_squared,Vtmp1);) 49 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.ui=(Stmp2.f<=Stmp1.f)?0xffffffff:0;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_cmple_ps(Vtmp2,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmp_ps(Vtmp2,Vtmp1, _CMP_LE_OS);) //ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_cmple_ps(Vtmp2,Vtmp1);) 50 51 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Ssine_pi_over_eight.ui&Stmp1.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_and_ps(Vsine_pi_over_eight,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_blendv_ps(Vsh,Vsine_pi_over_eight,Vtmp1);) 52 ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=~Stmp1.ui&Ssh.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_andnot_ps(Vtmp1,Vsh);) 53 ENABLE_SCALAR_IMPLEMENTATION(Ssh.ui=Ssh.ui|Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_or_ps(Vsh,Vtmp2);) 54 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.ui=Scosine_pi_over_eight.ui&Stmp1.ui;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_and_ps(Vcosine_pi_over_eight,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vch=_mm256_blendv_ps(Vch,Vcosine_pi_over_eight,Vtmp1);) 55 ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=~Stmp1.ui&Sch.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_andnot_ps(Vtmp1,Vch);) 56 ENABLE_SCALAR_IMPLEMENTATION(Sch.ui=Sch.ui|Stmp2.ui;) ENABLE_SSE_IMPLEMENTATION(Vch=_mm_or_ps(Vch,Vtmp2);) 57 58 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vsh);) 59 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Sch.f*Sch.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vch,Vch);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vch,Vch);) 60 ENABLE_SCALAR_IMPLEMENTATION(Sc.f=Stmp2.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(Vc=_mm_sub_ps(Vtmp2,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(Vc=_mm256_sub_ps(Vtmp2,Vtmp1);) 61 ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Sch.f*Ssh.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_mul_ps(Vch,Vsh);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_mul_ps(Vch,Vsh);) 62 ENABLE_SCALAR_IMPLEMENTATION(Ss.f=Ss.f+Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vs=_mm_add_ps(Vs,Vs);) ENABLE_AVX_IMPLEMENTATION(Vs=_mm256_add_ps(Vs,Vs);) 63 64 //########################################################### 65 // Perform the actual Givens conjugation 66 //########################################################### 67 68 #ifndef USE_ACCURATE_RSQRT_IN_JACOBI_CONJUGATION 69 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Stmp1.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_add_ps(Vtmp1,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_add_ps(Vtmp1,Vtmp2);) 70 ENABLE_SCALAR_IMPLEMENTATION(SS33.f=SS33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS33=_mm_mul_ps(VS33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS33=_mm256_mul_ps(VS33,Vtmp3);) 71 ENABLE_SCALAR_IMPLEMENTATION(SS31.f=SS31.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS31=_mm_mul_ps(VS31,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS31=_mm256_mul_ps(VS31,Vtmp3);) 72 ENABLE_SCALAR_IMPLEMENTATION(SS32.f=SS32.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS32=_mm_mul_ps(VS32,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS32=_mm256_mul_ps(VS32,Vtmp3);) 73 ENABLE_SCALAR_IMPLEMENTATION(SS33.f=SS33.f*Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS33=_mm_mul_ps(VS33,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS33=_mm256_mul_ps(VS33,Vtmp3);) 74 #endif 75 76 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ss.f*SS31.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vs,VS31);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vs,VS31);) 77 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*SS32.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,VS32);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,VS32);) 78 ENABLE_SCALAR_IMPLEMENTATION(SS31.f=Sc.f*SS31.f;) ENABLE_SSE_IMPLEMENTATION(VS31=_mm_mul_ps(Vc,VS31);) ENABLE_AVX_IMPLEMENTATION(VS31=_mm256_mul_ps(Vc,VS31);) 79 ENABLE_SCALAR_IMPLEMENTATION(SS32.f=Sc.f*SS32.f;) ENABLE_SSE_IMPLEMENTATION(VS32=_mm_mul_ps(Vc,VS32);) ENABLE_AVX_IMPLEMENTATION(VS32=_mm256_mul_ps(Vc,VS32);) 80 ENABLE_SCALAR_IMPLEMENTATION(SS31.f=Stmp2.f+SS31.f;) ENABLE_SSE_IMPLEMENTATION(VS31=_mm_add_ps(Vtmp2,VS31);) ENABLE_AVX_IMPLEMENTATION(VS31=_mm256_add_ps(Vtmp2,VS31);) 81 ENABLE_SCALAR_IMPLEMENTATION(SS32.f=SS32.f-Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VS32=_mm_sub_ps(VS32,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VS32=_mm256_sub_ps(VS32,Vtmp1);) 82 83 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ss.f*Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vs,Vs);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vs,Vs);) 84 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=SS22.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(VS22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(VS22,Vtmp2);) 85 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=SS11.f*Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(VS11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(VS11,Vtmp2);) 86 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sc.f*Sc.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vc,Vc);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vc,Vc);) 87 ENABLE_SCALAR_IMPLEMENTATION(SS11.f=SS11.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(VS11=_mm_mul_ps(VS11,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(VS11=_mm256_mul_ps(VS11,Vtmp4);) 88 ENABLE_SCALAR_IMPLEMENTATION(SS22.f=SS22.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(VS22=_mm_mul_ps(VS22,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(VS22=_mm256_mul_ps(VS22,Vtmp4);) 89 ENABLE_SCALAR_IMPLEMENTATION(SS11.f=SS11.f+Stmp1.f;) ENABLE_SSE_IMPLEMENTATION(VS11=_mm_add_ps(VS11,Vtmp1);) ENABLE_AVX_IMPLEMENTATION(VS11=_mm256_add_ps(VS11,Vtmp1);) 90 ENABLE_SCALAR_IMPLEMENTATION(SS22.f=SS22.f+Stmp3.f;) ENABLE_SSE_IMPLEMENTATION(VS22=_mm_add_ps(VS22,Vtmp3);) ENABLE_AVX_IMPLEMENTATION(VS22=_mm256_add_ps(VS22,Vtmp3);) 91 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Stmp4.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_sub_ps(Vtmp4,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_sub_ps(Vtmp4,Vtmp2);) 92 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=SS21.f+SS21.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_add_ps(VS21,VS21);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_add_ps(VS21,VS21);) 93 ENABLE_SCALAR_IMPLEMENTATION(SS21.f=SS21.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(VS21=_mm_mul_ps(VS21,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(VS21=_mm256_mul_ps(VS21,Vtmp4);) 94 ENABLE_SCALAR_IMPLEMENTATION(Stmp4.f=Sc.f*Ss.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp4=_mm_mul_ps(Vc,Vs);) ENABLE_AVX_IMPLEMENTATION(Vtmp4=_mm256_mul_ps(Vc,Vs);) 95 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Stmp2.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vtmp2,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vtmp2,Vtmp4);) 96 ENABLE_SCALAR_IMPLEMENTATION(Stmp5.f=Stmp5.f*Stmp4.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp5=_mm_mul_ps(Vtmp5,Vtmp4);) ENABLE_AVX_IMPLEMENTATION(Vtmp5=_mm256_mul_ps(Vtmp5,Vtmp4);) 97 ENABLE_SCALAR_IMPLEMENTATION(SS11.f=SS11.f+Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VS11=_mm_add_ps(VS11,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VS11=_mm256_add_ps(VS11,Vtmp2);) 98 ENABLE_SCALAR_IMPLEMENTATION(SS21.f=SS21.f-Stmp5.f;) ENABLE_SSE_IMPLEMENTATION(VS21=_mm_sub_ps(VS21,Vtmp5);) ENABLE_AVX_IMPLEMENTATION(VS21=_mm256_sub_ps(VS21,Vtmp5);) 99 ENABLE_SCALAR_IMPLEMENTATION(SS22.f=SS22.f-Stmp2.f;) ENABLE_SSE_IMPLEMENTATION(VS22=_mm_sub_ps(VS22,Vtmp2);) ENABLE_AVX_IMPLEMENTATION(VS22=_mm256_sub_ps(VS22,Vtmp2);) 100 101 //########################################################### 102 // Compute the cumulative rotation, in quaternion form 103 //########################################################### 104 105 ENABLE_SCALAR_IMPLEMENTATION(Stmp1.f=Ssh.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp1=_mm_mul_ps(Vsh,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vtmp1=_mm256_mul_ps(Vsh,Vqvvx);) 106 ENABLE_SCALAR_IMPLEMENTATION(Stmp2.f=Ssh.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp2=_mm_mul_ps(Vsh,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vtmp2=_mm256_mul_ps(Vsh,Vqvvy);) 107 ENABLE_SCALAR_IMPLEMENTATION(Stmp3.f=Ssh.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vtmp3=_mm_mul_ps(Vsh,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vtmp3=_mm256_mul_ps(Vsh,Vqvvz);) 108 ENABLE_SCALAR_IMPLEMENTATION(Ssh.f=Ssh.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vsh=_mm_mul_ps(Vsh,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vsh=_mm256_mul_ps(Vsh,Vqvs);) 109 110 ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sch.f*Sqvs.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_mul_ps(Vch,Vqvs);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_mul_ps(Vch,Vqvs);) 111 ENABLE_SCALAR_IMPLEMENTATION(Sqvvx.f=Sch.f*Sqvvx.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvx=_mm_mul_ps(Vch,Vqvvx);) ENABLE_AVX_IMPLEMENTATION(Vqvvx=_mm256_mul_ps(Vch,Vqvvx);) 112 ENABLE_SCALAR_IMPLEMENTATION(Sqvvy.f=Sch.f*Sqvvy.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvy=_mm_mul_ps(Vch,Vqvvy);) ENABLE_AVX_IMPLEMENTATION(Vqvvy=_mm256_mul_ps(Vch,Vqvvy);) 113 ENABLE_SCALAR_IMPLEMENTATION(Sqvvz.f=Sch.f*Sqvvz.f;) ENABLE_SSE_IMPLEMENTATION(Vqvvz=_mm_mul_ps(Vch,Vqvvz);) ENABLE_AVX_IMPLEMENTATION(Vqvvz=_mm256_mul_ps(Vch,Vqvvz);) 114 115 ENABLE_SCALAR_IMPLEMENTATION(SQVVZ.f=SQVVZ.f+Ssh.f;) ENABLE_SSE_IMPLEMENTATION(VQVVZ=_mm_add_ps(VQVVZ,Vsh);) ENABLE_AVX_IMPLEMENTATION(VQVVZ=_mm256_add_ps(VQVVZ,Vsh);) 116 ENABLE_SCALAR_IMPLEMENTATION(Sqvs.f=Sqvs.f-STMP3.f;) ENABLE_SSE_IMPLEMENTATION(Vqvs=_mm_sub_ps(Vqvs,VTMP3);) ENABLE_AVX_IMPLEMENTATION(Vqvs=_mm256_sub_ps(Vqvs,VTMP3);) 117 ENABLE_SCALAR_IMPLEMENTATION(SQVVX.f=SQVVX.f+STMP2.f;) ENABLE_SSE_IMPLEMENTATION(VQVVX=_mm_add_ps(VQVVX,VTMP2);) ENABLE_AVX_IMPLEMENTATION(VQVVX=_mm256_add_ps(VQVVX,VTMP2);) 118 ENABLE_SCALAR_IMPLEMENTATION(SQVVY.f=SQVVY.f-STMP1.f;) ENABLE_SSE_IMPLEMENTATION(VQVVY=_mm_sub_ps(VQVVY,VTMP1);) ENABLE_AVX_IMPLEMENTATION(VQVVY=_mm256_sub_ps(VQVVY,VTMP1);) 119