/// @ref core

#if GLM_ARCH & GLM_ARCH_SSE2_BIT

namespace glm{
namespace detail
{
/*
	template<qualifier Q>
	struct compute_quat_mul<float, Q, true>
	{
		static qua<float, Q> call(qua<float, Q> const& q1, qua<float, Q> const& q2)
		{
			// SSE2 STATS: 11 shuffle, 8 mul, 8 add
			// SSE4 STATS: 3 shuffle, 4 mul, 4 dpps

			__m128 const mul0 = _mm_mul_ps(q1.data, _mm_shuffle_ps(q2.data, q2.data, _MM_SHUFFLE(0, 1, 2, 3)));
			__m128 const mul1 = _mm_mul_ps(q1.data, _mm_shuffle_ps(q2.data, q2.data, _MM_SHUFFLE(1, 0, 3, 2)));
			__m128 const mul2 = _mm_mul_ps(q1.data, _mm_shuffle_ps(q2.data, q2.data, _MM_SHUFFLE(2, 3, 0, 1)));
			__m128 const mul3 = _mm_mul_ps(q1.data, q2.data);

#	if GLM_ARCH & GLM_ARCH_SSE41_BIT
			__m128 const add4 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f), 0xff);
			__m128 const add5 = _mm_dp_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f), 0xff);
			__m128 const add6 = _mm_dp_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f), 0xff);
			__m128 const add7 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
#	else
			// Apply the sign mask first, then reduce the four signed products horizontally.
			__m128 const mul4 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f, 1.0f, 1.0f));
			__m128 const add0 = _mm_add_ps(mul4, _mm_movehl_ps(mul4, mul4));
			__m128 const add4 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));

			__m128 const mul5 = _mm_mul_ps(mul1, _mm_set_ps(1.0f, 1.0f, 1.0f, -1.0f));
			__m128 const add1 = _mm_add_ps(mul5, _mm_movehl_ps(mul5, mul5));
			__m128 const add5 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));

			__m128 const mul6 = _mm_mul_ps(mul2, _mm_set_ps(1.0f, 1.0f, -1.0f, 1.0f));
			__m128 const add2 = _mm_add_ps(mul6, _mm_movehl_ps(mul6, mul6));
			__m128 const add6 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));

			__m128 const mul7 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
			__m128 const add3 = _mm_add_ps(mul7, _mm_movehl_ps(mul7, mul7));
			__m128 const add7 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
#	endif

			// Recombining the four scalar results with shuffles keeps everything in SIMD
			// registers, but in every test tried it has been slower than the scalar stores
			// below. Kept here for reference - maybe somebody else can do something better...
			//
			//__m128 xxyy = _mm_shuffle_ps(add4, add5, _MM_SHUFFLE(0, 0, 0, 0));
			//__m128 zzww = _mm_shuffle_ps(add6, add7, _MM_SHUFFLE(0, 0, 0, 0));
			//
			//return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));

			qua<float, Q> Result;
			_mm_store_ss(&Result.x, add4);
			_mm_store_ss(&Result.y, add5);
			_mm_store_ss(&Result.z, add6);
			_mm_store_ss(&Result.w, add7);
			return Result;
		}
	};
*/
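
	// Illustrative scalar reference for the specialization sketched above (added for
	// exposition, not part of GLM): each line spells out the Hamilton product term that
	// one (shuffle, sign mask, horizontal add) group of the SIMD path computes, which
	// makes the sign vectors easy to verify lane by lane.
	template<qualifier Q>
	inline qua<float, Q> quat_mul_scalar_reference(qua<float, Q> const& q1, qua<float, Q> const& q2)
	{
		qua<float, Q> Result;
		Result.x =  q1.x * q2.w + q1.y * q2.z - q1.z * q2.y + q1.w * q2.x; // mul0 lanes, signs (+, +, -, +)
		Result.y = -q1.x * q2.z + q1.y * q2.w + q1.z * q2.x + q1.w * q2.y; // mul1 lanes, signs (-, +, +, +)
		Result.z =  q1.x * q2.y - q1.y * q2.x + q1.z * q2.w + q1.w * q2.z; // mul2 lanes, signs (+, -, +, +)
		Result.w = -q1.x * q2.x - q1.y * q2.y - q1.z * q2.z + q1.w * q2.w; // mul3 lanes, signs (-, -, -, +)
		return Result;
	}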

	template<qualifier Q>
	struct compute_quat_add<float, Q, true>
	{
		static qua<float, Q> call(qua<float, Q> const& q, qua<float, Q> const& p)
		{
			qua<float, Q> Result;
			Result.data = _mm_add_ps(q.data, p.data);
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template<qualifier Q>
	struct compute_quat_add<double, Q, true>
	{
		static qua<double, Q> call(qua<double, Q> const& a, qua<double, Q> const& b)
		{
			qua<double, Q> Result;
			Result.data = _mm256_add_pd(a.data, b.data);
			return Result;
		}
	};
#	endif

	template<qualifier Q>
	struct compute_quat_sub<float, Q, true>
	{
		static qua<float, Q> call(qua<float, Q> const& q, qua<float, Q> const& p)
		{
			qua<float, Q> Result;
			Result.data = _mm_sub_ps(q.data, p.data);
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template<qualifier Q>
	struct compute_quat_sub<double, Q, true>
	{
		static qua<double, Q> call(qua<double, Q> const& a, qua<double, Q> const& b)
		{
			qua<double, Q> Result;
			Result.data = _mm256_sub_pd(a.data, b.data);
			return Result;
		}
	};
#	endif

	template<qualifier Q>
	struct compute_quat_mul_scalar<float, Q, true>
	{
		static qua<float, Q> call(qua<float, Q> const& q, float s)
		{
			qua<float, Q> Result;
			Result.data = _mm_mul_ps(q.data, _mm_set_ps1(s));
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template<qualifier Q>
	struct compute_quat_mul_scalar<double, Q, true>
	{
		static qua<double, Q> call(qua<double, Q> const& q, double s)
		{
			qua<double, Q> Result;
			Result.data = _mm256_mul_pd(q.data, _mm256_set1_pd(s));
			return Result;
		}
	};
#	endif

	template<qualifier Q>
	struct compute_quat_div_scalar<float, Q, true>
	{
		static qua<float, Q> call(qua<float, Q> const& q, float s)
		{
			qua<float, Q> Result;
			Result.data = _mm_div_ps(q.data, _mm_set_ps1(s));
			return Result;
		}
	};

#	if GLM_ARCH & GLM_ARCH_AVX_BIT
	template<qualifier Q>
	struct compute_quat_div_scalar<double, Q, true>
	{
		static qua<double, Q> call(qua<double, Q> const& q, double s)
		{
			qua<double, Q> Result;
			Result.data = _mm256_div_pd(q.data, _mm256_set1_pd(s));
			return Result;
		}
	};
#	endif
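
	// Hypothetical alternative for the scalar division above (a sketch for exposition,
	// not part of GLM): _mm_div_ps is exactly rounded but fairly high-latency, so some
	// code trades precision for speed using an _mm_rcp_ps estimate (~12-bit accurate)
	// refined by one Newton-Raphson step before multiplying. The exact _mm_div_ps used
	// above remains the safe default.
	inline __m128 quat_div_scalar_approx(__m128 v, float s)
	{
		__m128 const S = _mm_set_ps1(s);
		__m128 const r0 = _mm_rcp_ps(S);                 // reciprocal estimate, ~12 bits
		__m128 const r1 = _mm_sub_ps(_mm_add_ps(r0, r0), // r1 = r0 * (2 - S * r0), one NR step
			_mm_mul_ps(_mm_mul_ps(S, r0), r0));
		return _mm_mul_ps(v, r1);                        // v * (1 / s), approximately
	}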

	template<qualifier Q>
	struct compute_quat_mul_vec4<float, Q, true>
	{
		static vec<4, float, Q> call(qua<float, Q> const& q, vec<4, float, Q> const& v)
		{
			__m128 const q_wwww = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 const q_swp0 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 0, 2, 1));
			__m128 const q_swp1 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 1, 0, 2));
			__m128 const v_swp0 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 0, 2, 1));
			__m128 const v_swp1 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 1, 0, 2));

			__m128 uv = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0));
			__m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1));
			__m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2));
			__m128 uuv = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0));

			__m128 const two = _mm_set1_ps(2.0f);
			uv = _mm_mul_ps(uv, _mm_mul_ps(q_wwww, two));
			uuv = _mm_mul_ps(uuv, two);

			vec<4, float, Q> Result;
			Result.data = _mm_add_ps(v.data, _mm_add_ps(uv, uuv));
			return Result;
		}
	};
}//namespace detail
}//namespace glm

#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
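
// Illustrative reference for compute_quat_mul_vec4 above (a sketch for exposition, not
// part of GLM): the SIMD path evaluates the cross-product expansion of a quaternion
// rotation,
//
//     v' = v + 2w * (u x v) + 2 * (u x (u x v)),    with u = (q.x, q.y, q.z), w = q.w,
//
// where each cross product comes from one (3, 0, 2, 1) / (3, 1, 0, 2) shuffle pair.
// A scalar equivalent using only core GLM types:
//
//	template<qualifier Q>
//	vec<4, float, Q> quat_rotate_reference(qua<float, Q> const& q, vec<4, float, Q> const& v)
//	{
//		vec<3, float, Q> const u(q.x, q.y, q.z);
//		vec<3, float, Q> const p(v.x, v.y, v.z);
//		vec<3, float, Q> const uv = cross(u, p);
//		vec<3, float, Q> const uuv = cross(u, uv);
//		vec<3, float, Q> const r = p + (uv * q.w + uuv) * 2.0f;
//		return vec<4, float, Q>(r.x, r.y, r.z, v.w); // the SIMD path also leaves v.w untouched
//	}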