#if GLM_ARCH & GLM_ARCH_SSE2_BIT

#include "type_mat4x4.hpp"
#include "../geometric.hpp"
#include "../simd/matrix.h"
#include <cstring>

namespace glm{
namespace detail
{
#	if GLM_CONFIG_ALIGNED_GENTYPES == GLM_ENABLE
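	// Component-wise multiplication (matrixCompMult) for aligned float 4x4
	// matrices, forwarded to the SSE2 helper in simd/matrix.h.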
	template<qualifier Q>
	struct compute_matrixCompMult<4, 4, float, Q, true>
	{
		GLM_STATIC_ASSERT(detail::is_aligned<Q>::value, "Specialization requires aligned");

		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& x, mat<4, 4, float, Q> const& y)
		{
			mat<4, 4, float, Q> Result;
			glm_mat4_matrixCompMult(
				// static_cast cannot convert between unrelated pointer types;
				// reinterpret the pointer to the first column as a pointer to
				// the whole four-column array.
				*reinterpret_cast<glm_vec4 const (*)[4]>(&x[0].data),
				*reinterpret_cast<glm_vec4 const (*)[4]>(&y[0].data),
				*reinterpret_cast<glm_vec4(*)[4]>(&Result[0].data));
			return Result;
		}
	};
#	endif

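	// 4x4 float transpose via the shuffle-based SSE2 helper.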
	template<qualifier Q>
	struct compute_transpose<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& m)
		{
			mat<4, 4, float, Q> Result;
			glm_mat4_transpose(&m[0].data, &Result[0].data);
			return Result;
		}
	};

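	// Determinant computed entirely in SSE registers; only the lowest lane
	// of the result vector is extracted.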
	template<qualifier Q>
	struct compute_determinant<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static float call(mat<4, 4, float, Q> const& m)
		{
			return _mm_cvtss_f32(glm_mat4_determinant(&m[0].data));
		}
	};

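	// 4x4 inverse, forwarded to the SSE2 helper.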
	template<qualifier Q>
	struct compute_inverse<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& m)
		{
			mat<4, 4, float, Q> Result;
			glm_mat4_inverse(&m[0].data, &Result[0].data);
			return Result;
		}
	};
}//namespace detail

#	if GLM_CONFIG_ALIGNED_GENTYPES == GLM_ENABLE
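	// outerProduct(c, r) = c * transpose(r): computed in SSE registers, then
	// copied into the result with memcpy. One explicit specialization per
	// aligned qualifier.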
	template<>
	GLM_FUNC_QUALIFIER mat<4, 4, float, aligned_lowp> outerProduct<4, 4, float, aligned_lowp>(vec<4, float, aligned_lowp> const& c, vec<4, float, aligned_lowp> const& r)
	{
		__m128 NativeResult[4];
		glm_mat4_outerProduct(c.data, r.data, NativeResult);
		mat<4, 4, float, aligned_lowp> Result;
		std::memcpy(&Result[0], &NativeResult[0], sizeof(Result));
		return Result;
	}

	template<>
	GLM_FUNC_QUALIFIER mat<4, 4, float, aligned_mediump> outerProduct<4, 4, float, aligned_mediump>(vec<4, float, aligned_mediump> const& c, vec<4, float, aligned_mediump> const& r)
	{
		__m128 NativeResult[4];
		glm_mat4_outerProduct(c.data, r.data, NativeResult);
		mat<4, 4, float, aligned_mediump> Result;
		std::memcpy(&Result[0], &NativeResult[0], sizeof(Result));
		return Result;
	}

	template<>
	GLM_FUNC_QUALIFIER mat<4, 4, float, aligned_highp> outerProduct<4, 4, float, aligned_highp>(vec<4, float, aligned_highp> const& c, vec<4, float, aligned_highp> const& r)
	{
		__m128 NativeResult[4];
		glm_mat4_outerProduct(c.data, r.data, NativeResult);
		mat<4, 4, float, aligned_highp> Result;
		std::memcpy(&Result[0], &NativeResult[0], sizeof(Result));
		return Result;
	}
#	endif
}//namespace glm

#elif GLM_ARCH & GLM_ARCH_NEON_BIT

#include <type_traits> // std::enable_if, used to restrict operator* below

namespace glm {
#if GLM_LANG & GLM_LANG_CXX11_FLAG
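	// Aligned 4x4 float multiply: each call of MulRow produces one column of
	// the product as a linear combination of m1's columns, weighted by one
	// column of m2 via lane-broadcast multiply-accumulate.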
	template <qualifier Q>
	GLM_FUNC_QUALIFIER
	typename std::enable_if<detail::is_aligned<Q>::value, mat<4, 4, float, Q>>::type
	operator*(mat<4, 4, float, Q> const & m1, mat<4, 4, float, Q> const & m2)
	{
		auto MulRow = [&](int l) {
			float32x4_t const SrcA = m2[l].data;

			float32x4_t r = neon::mul_lane(m1[0].data, SrcA, 0);
			r = neon::madd_lane(r, m1[1].data, SrcA, 1);
			r = neon::madd_lane(r, m1[2].data, SrcA, 2);
			r = neon::madd_lane(r, m1[3].data, SrcA, 3);

			return r;
		};

		mat<4, 4, float, Q> Result; // qualifier Q matches the declared return type, avoiding a conversion
		Result[0].data = MulRow(0);
		Result[1].data = MulRow(1);
		Result[2].data = MulRow(2);
		Result[3].data = MulRow(3);

		return Result;
	}
#endif // CXX11

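	// 4x4 inverse by cofactor (adjugate) expansion, mirroring GLM's generic
	// compute_inverse in func_matrix.inl: six vectors of 2x2 sub-determinants
	// (Fac0..Fac5, lanes 0 and 1 identical by construction) are combined into
	// the columns of the cofactor matrix, which is then scaled by the
	// reciprocal determinant.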
	template<qualifier Q>
	struct detail::compute_inverse<4, 4, float, Q, true>
	{
		GLM_FUNC_QUALIFIER static mat<4, 4, float, Q> call(mat<4, 4, float, Q> const& m)
		{
			float32x4_t const& m0 = m[0].data;
			float32x4_t const& m1 = m[1].data;
			float32x4_t const& m2 = m[2].data;
			float32x4_t const& m3 = m[3].data;

			// m[2][2] * m[3][3] - m[3][2] * m[2][3];
			// m[2][2] * m[3][3] - m[3][2] * m[2][3];
			// m[1][2] * m[3][3] - m[3][2] * m[1][3];
			// m[1][2] * m[2][3] - m[2][2] * m[1][3];

			float32x4_t Fac0;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
				Fac0 = w0 * w1 - w2 * w3;
			}

			// m[2][1] * m[3][3] - m[3][1] * m[2][3];
			// m[2][1] * m[3][3] - m[3][1] * m[2][3];
			// m[1][1] * m[3][3] - m[3][1] * m[1][3];
			// m[1][1] * m[2][3] - m[2][1] * m[1][3];

			float32x4_t Fac1;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
				Fac1 = w0 * w1 - w2 * w3;
			}

			// m[2][1] * m[3][2] - m[3][1] * m[2][2];
			// m[2][1] * m[3][2] - m[3][1] * m[2][2];
			// m[1][1] * m[3][2] - m[3][1] * m[1][2];
			// m[1][1] * m[2][2] - m[2][1] * m[1][2];

			float32x4_t Fac2;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
				Fac2 = w0 * w1 - w2 * w3;
			}

			// m[2][0] * m[3][3] - m[3][0] * m[2][3];
			// m[2][0] * m[3][3] - m[3][0] * m[2][3];
			// m[1][0] * m[3][3] - m[3][0] * m[1][3];
			// m[1][0] * m[2][3] - m[2][0] * m[1][3];

			float32x4_t Fac3;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 3), 3, m2, 3);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 3), neon::dup_lane(m1, 3));
				Fac3 = w0 * w1 - w2 * w3;
			}

			// m[2][0] * m[3][2] - m[3][0] * m[2][2];
			// m[2][0] * m[3][2] - m[3][0] * m[2][2];
			// m[1][0] * m[3][2] - m[3][0] * m[1][2];
			// m[1][0] * m[2][2] - m[2][0] * m[1][2];

			float32x4_t Fac4;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 2), 3, m2, 2);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 2), neon::dup_lane(m1, 2));
				Fac4 = w0 * w1 - w2 * w3;
			}

			// m[2][0] * m[3][1] - m[3][0] * m[2][1];
			// m[2][0] * m[3][1] - m[3][0] * m[2][1];
			// m[1][0] * m[3][1] - m[3][0] * m[1][1];
			// m[1][0] * m[2][1] - m[2][0] * m[1][1];

			float32x4_t Fac5;
			{
				float32x4_t w0 = vcombine_f32(neon::dup_lane(m2, 0), neon::dup_lane(m1, 0));
				float32x4_t w1 = neon::copy_lane(neon::dupq_lane(m3, 1), 3, m2, 1);
				float32x4_t w2 = neon::copy_lane(neon::dupq_lane(m3, 0), 3, m2, 0);
				float32x4_t w3 = vcombine_f32(neon::dup_lane(m2, 1), neon::dup_lane(m1, 1));
				Fac5 = w0 * w1 - w2 * w3;
			}

			float32x4_t Vec0 = neon::copy_lane(neon::dupq_lane(m0, 0), 0, m1, 0); // (m[1][0], m[0][0], m[0][0], m[0][0]);
			float32x4_t Vec1 = neon::copy_lane(neon::dupq_lane(m0, 1), 0, m1, 1); // (m[1][1], m[0][1], m[0][1], m[0][1]);
			float32x4_t Vec2 = neon::copy_lane(neon::dupq_lane(m0, 2), 0, m1, 2); // (m[1][2], m[0][2], m[0][2], m[0][2]);
			float32x4_t Vec3 = neon::copy_lane(neon::dupq_lane(m0, 3), 0, m1, 3); // (m[1][3], m[0][3], m[0][3], m[0][3]);

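			// Unsigned cofactors: combine the 2x2 sub-determinants with the
			// elements gathered from columns 0 and 1 above.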
			float32x4_t Inv0 = Vec1 * Fac0 - Vec2 * Fac1 + Vec3 * Fac2;
			float32x4_t Inv1 = Vec0 * Fac0 - Vec2 * Fac3 + Vec3 * Fac4;
			float32x4_t Inv2 = Vec0 * Fac1 - Vec1 * Fac3 + Vec3 * Fac5;
			float32x4_t Inv3 = Vec0 * Fac2 - Vec1 * Fac4 + Vec2 * Fac5;

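			// Apply the alternating cofactor signs; a uniform sign flip across
			// all columns would cancel against the determinant computed below.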
			float32x4_t r0 = float32x4_t{-1, +1, -1, +1} * Inv0;
			float32x4_t r1 = float32x4_t{+1, -1, +1, -1} * Inv1;
			float32x4_t r2 = float32x4_t{-1, +1, -1, +1} * Inv2;
			float32x4_t r3 = float32x4_t{+1, -1, +1, -1} * Inv3;

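			// Laplace expansion down the first column: lane 0 of det is the
			// dot product of m[0] with the first lanes of r0..r3, i.e. the
			// determinant. Only lane 0 is read below.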
			float32x4_t det = neon::mul_lane(r0, m0, 0);
			det = neon::madd_lane(det, r1, m0, 1);
			det = neon::madd_lane(det, r2, m0, 2);
			det = neon::madd_lane(det, r3, m0, 3);

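			// Broadcast the reciprocal determinant and scale each signed
			// cofactor column to form the inverse.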
			float32x4_t rdet = vdupq_n_f32(1.0f / vgetq_lane_f32(det, 0));

			mat<4, 4, float, Q> r;
			r[0].data = vmulq_f32(r0, rdet);
			r[1].data = vmulq_f32(r1, rdet);
			r[2].data = vmulq_f32(r2, rdet);
			r[3].data = vmulq_f32(r3, rdet);
			return r;
		}
	};
}//namespace glm
#endif