1/// @ref core
2
3#if GLM_ARCH & GLM_ARCH_SSE2_BIT
4
5namespace glm{
6namespace detail
7{
8/*
9	template<qualifier Q>
10	struct compute_quat_mul<float, Q, true>
11	{
12		static qua<float, Q> call(qua<float, Q> const& q1, qua<float, Q> const& q2)
13		{
14			// SSE2 STATS: 11 shuffle, 8 mul, 8 add
15			// SSE4 STATS: 3 shuffle, 4 mul, 4 dpps
16
17			__m128 const mul0 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(0, 1, 2, 3)));
18			__m128 const mul1 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(1, 0, 3, 2)));
19			__m128 const mul2 = _mm_mul_ps(q1.Data, _mm_shuffle_ps(q2.Data, q2.Data, _MM_SHUFFLE(2, 3, 0, 1)));
20			__m128 const mul3 = _mm_mul_ps(q1.Data, q2.Data);
21
22#			if GLM_ARCH & GLM_ARCH_SSE41_BIT
23				__m128 const add0 = _mm_dp_ps(mul0, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f), 0xff);
24				__m128 const add1 = _mm_dp_ps(mul1, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f), 0xff);
25				__m128 const add2 = _mm_dp_ps(mul2, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f), 0xff);
26				__m128 const add3 = _mm_dp_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f), 0xff);
27#			else
28				__m128 const mul4 = _mm_mul_ps(mul0, _mm_set_ps(1.0f, -1.0f,  1.0f,  1.0f));
29				__m128 const add0 = _mm_add_ps(mul0, _mm_movehl_ps(mul4, mul4));
30				__m128 const add4 = _mm_add_ss(add0, _mm_shuffle_ps(add0, add0, 1));
31
32				__m128 const mul5 = _mm_mul_ps(mul1, _mm_set_ps(1.0f,  1.0f,  1.0f, -1.0f));
33				__m128 const add1 = _mm_add_ps(mul1, _mm_movehl_ps(mul5, mul5));
34				__m128 const add5 = _mm_add_ss(add1, _mm_shuffle_ps(add1, add1, 1));
35
36				__m128 const mul6 = _mm_mul_ps(mul2, _mm_set_ps(1.0f,  1.0f, -1.0f,  1.0f));
37				__m128 const add2 = _mm_add_ps(mul6, _mm_movehl_ps(mul6, mul6));
38				__m128 const add6 = _mm_add_ss(add2, _mm_shuffle_ps(add2, add2, 1));
39
40				__m128 const mul7 = _mm_mul_ps(mul3, _mm_set_ps(1.0f, -1.0f, -1.0f, -1.0f));
41				__m128 const add3 = _mm_add_ps(mul3, _mm_movehl_ps(mul7, mul7));
42				__m128 const add7 = _mm_add_ss(add3, _mm_shuffle_ps(add3, add3, 1));
43		#endif
44
			// The pure-SIMD shuffle path below is the "textbook" way to assemble the result, but in every test
			// it has been slower than the scalar stores used at the end. Kept here for reference in case
			// somebody can make it faster...
47			//
48			//__m128 xxyy = _mm_shuffle_ps(add4, add5, _MM_SHUFFLE(0, 0, 0, 0));
49			//__m128 zzww = _mm_shuffle_ps(add6, add7, _MM_SHUFFLE(0, 0, 0, 0));
50			//
51			//return _mm_shuffle_ps(xxyy, zzww, _MM_SHUFFLE(2, 0, 2, 0));
52
53			qua<float, Q> Result;
54			_mm_store_ss(&Result.x, add4);
55			_mm_store_ss(&Result.y, add5);
56			_mm_store_ss(&Result.z, add6);
57			_mm_store_ss(&Result.w, add7);
58			return Result;
59		}
60	};
61*/
62
63	template<qualifier Q>
64	struct compute_quat_add<float, Q, true>
65	{
66		static qua<float, Q> call(qua<float, Q> const& q, qua<float, Q> const& p)
67		{
68			qua<float, Q> Result;
69			Result.data = _mm_add_ps(q.data, p.data);
70			return Result;
71		}
72	};
73
74#	if GLM_ARCH & GLM_ARCH_AVX_BIT
75	template<qualifier Q>
76	struct compute_quat_add<double, Q, true>
77	{
78		static qua<double, Q> call(qua<double, Q> const& a, qua<double, Q> const& b)
79		{
80			qua<double, Q> Result;
81			Result.data = _mm256_add_pd(a.data, b.data);
82			return Result;
83		}
84	};
85#	endif
86
87	template<qualifier Q>
88	struct compute_quat_sub<float, Q, true>
89	{
90		static qua<float, Q> call(qua<float, Q> const& q, qua<float, Q> const& p)
91		{
92			vec<4, float, Q> Result;
93			Result.data = _mm_sub_ps(q.data, p.data);
94			return Result;
95		}
96	};
97
98#	if GLM_ARCH & GLM_ARCH_AVX_BIT
99	template<qualifier Q>
100	struct compute_quat_sub<double, Q, true>
101	{
102		static qua<double, Q> call(qua<double, Q> const& a, qua<double, Q> const& b)
103		{
104			qua<double, Q> Result;
105			Result.data = _mm256_sub_pd(a.data, b.data);
106			return Result;
107		}
108	};
109#	endif
110
111	template<qualifier Q>
112	struct compute_quat_mul_scalar<float, Q, true>
113	{
114		static qua<float, Q> call(qua<float, Q> const& q, float s)
115		{
116			vec<4, float, Q> Result;
117			Result.data = _mm_mul_ps(q.data, _mm_set_ps1(s));
118			return Result;
119		}
120	};
121
122#	if GLM_ARCH & GLM_ARCH_AVX_BIT
123	template<qualifier Q>
124	struct compute_quat_mul_scalar<double, Q, true>
125	{
126		static qua<double, Q> call(qua<double, Q> const& q, double s)
127		{
128			qua<double, Q> Result;
129			Result.data = _mm256_mul_pd(q.data, _mm_set_ps1(s));
130			return Result;
131		}
132	};
133#	endif
134
135	template<qualifier Q>
136	struct compute_quat_div_scalar<float, Q, true>
137	{
138		static qua<float, Q> call(qua<float, Q> const& q, float s)
139		{
140			vec<4, float, Q> Result;
141			Result.data = _mm_div_ps(q.data, _mm_set_ps1(s));
142			return Result;
143		}
144	};
145
146#	if GLM_ARCH & GLM_ARCH_AVX_BIT
147	template<qualifier Q>
148	struct compute_quat_div_scalar<double, Q, true>
149	{
150		static qua<double, Q> call(qua<double, Q> const& q, double s)
151		{
152			qua<double, Q> Result;
153			Result.data = _mm256_div_pd(q.data, _mm_set_ps1(s));
154			return Result;
155		}
156	};
157#	endif
158
159	template<qualifier Q>
160	struct compute_quat_mul_vec4<float, Q, true>
161	{
162		static vec<4, float, Q> call(qua<float, Q> const& q, vec<4, float, Q> const& v)
163		{
164			__m128 const q_wwww = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 3, 3, 3));
165			__m128 const q_swp0 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 0, 2, 1));
166			__m128 const q_swp1 = _mm_shuffle_ps(q.data, q.data, _MM_SHUFFLE(3, 1, 0, 2));
167			__m128 const v_swp0 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 0, 2, 1));
168			__m128 const v_swp1 = _mm_shuffle_ps(v.data, v.data, _MM_SHUFFLE(3, 1, 0, 2));
169
170			__m128 uv      = _mm_sub_ps(_mm_mul_ps(q_swp0, v_swp1), _mm_mul_ps(q_swp1, v_swp0));
171			__m128 uv_swp0 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 0, 2, 1));
172			__m128 uv_swp1 = _mm_shuffle_ps(uv, uv, _MM_SHUFFLE(3, 1, 0, 2));
173			__m128 uuv     = _mm_sub_ps(_mm_mul_ps(q_swp0, uv_swp1), _mm_mul_ps(q_swp1, uv_swp0));
174
175			__m128 const two = _mm_set1_ps(2.0f);
176			uv  = _mm_mul_ps(uv, _mm_mul_ps(q_wwww, two));
177			uuv = _mm_mul_ps(uuv, two);
178
179			vec<4, float, Q> Result;
180			Result.data = _mm_add_ps(v.Data, _mm_add_ps(uv, uuv));
181			return Result;
182		}
183	};
184}//namespace detail
185}//namespace glm
186
187#endif//GLM_ARCH & GLM_ARCH_SSE2_BIT
188
189