1 /*
2 Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans  http://continuousphysics.com/Bullet/
3 
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
9 
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
13 */
14 
15 #ifndef BT_MATRIX3x3_H
16 #define BT_MATRIX3x3_H
17 
18 #include "btVector3.h"
19 #include "btQuaternion.h"
20 #include <stdio.h>
21 
22 #ifdef BT_USE_SSE
23 //const __m128 ATTRIBUTE_ALIGNED16(v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
24 //const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
25 #define vMPPP (_mm_set_ps(+0.0f, +0.0f, +0.0f, -0.0f))
26 #endif
27 
28 #if defined(BT_USE_SSE)
29 #define v1000 (_mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f))
30 #define v0100 (_mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f))
31 #define v0010 (_mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f))
32 #elif defined(BT_USE_NEON)
33 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
34 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
35 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
36 #endif
37 
38 #ifdef BT_USE_DOUBLE_PRECISION
39 #define btMatrix3x3Data btMatrix3x3DoubleData
40 #else
41 #define btMatrix3x3Data btMatrix3x3FloatData
42 #endif  //BT_USE_DOUBLE_PRECISION
43 
44 /**@brief The btMatrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with btQuaternion, btTransform and btVector3.
45 * Make sure to only include a pure orthogonal matrix without scaling. */
ATTRIBUTE_ALIGNED16(class)
btMatrix3x3
{
	///Data storage for the matrix, each vector is a row of the matrix
	btVector3 m_el[3];

public:
	/** @brief No initialization constructor */
	btMatrix3x3() {}

	//		explicit btMatrix3x3(const btScalar *m) { setFromOpenGLSubMatrix(m); }

	/**@brief Constructor from Quaternion */
	explicit btMatrix3x3(const btQuaternion& q) { setRotation(q); }
	/*
	template <typename btScalar>
	Matrix3x3(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
	{
	setEulerYPR(yaw, pitch, roll);
	}
	*/
	/** @brief Constructor with row major formatting */
	btMatrix3x3(const btScalar& xx, const btScalar& xy, const btScalar& xz,
				const btScalar& yx, const btScalar& yy, const btScalar& yz,
				const btScalar& zx, const btScalar& zy, const btScalar& zz)
	{
		setValue(xx, xy, xz,
				 yx, yy, yz,
				 zx, zy, zz);
	}

#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
	/** @brief Constructor from three SIMD row registers (SSE/NEON builds only) */
	SIMD_FORCE_INLINE btMatrix3x3(const btSimdFloat4 v0, const btSimdFloat4 v1, const btSimdFloat4 v2)
	{
		m_el[0].mVec128 = v0;
		m_el[1].mVec128 = v1;
		m_el[2].mVec128 = v2;
	}

	/** @brief Constructor from three row vectors */
	SIMD_FORCE_INLINE btMatrix3x3(const btVector3& v0, const btVector3& v1, const btVector3& v2)
	{
		m_el[0] = v0;
		m_el[1] = v1;
		m_el[2] = v2;
	}

	// Copy constructor
	SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& rhs)
	{
		m_el[0].mVec128 = rhs.m_el[0].mVec128;
		m_el[1].mVec128 = rhs.m_el[1].mVec128;
		m_el[2].mVec128 = rhs.m_el[2].mVec128;
	}

	// Assignment Operator
	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& m)
	{
		m_el[0].mVec128 = m.m_el[0].mVec128;
		m_el[1].mVec128 = m.m_el[1].mVec128;
		m_el[2].mVec128 = m.m_el[2].mVec128;

		return *this;
	}

#else

	/** @brief Copy constructor */
	SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& other)
	{
		m_el[0] = other.m_el[0];
		m_el[1] = other.m_el[1];
		m_el[2] = other.m_el[2];
	}

	/** @brief Assignment Operator */
	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& other)
	{
		m_el[0] = other.m_el[0];
		m_el[1] = other.m_el[1];
		m_el[2] = other.m_el[2];
		return *this;
	}

#endif

	/** @brief Get a column of the matrix as a vector (returned by value; the
	*  storage is row-major, so a column must be gathered element by element)
	*  @param i Column number 0 indexed
	*  NOTE(review): unlike getRow()/operator[], there is no btFullAssert range
	*  check on i here — confirm whether that is intentional. */
	SIMD_FORCE_INLINE btVector3 getColumn(int i) const
	{
		return btVector3(m_el[0][i], m_el[1][i], m_el[2][i]);
	}

	/** @brief Get a row of the matrix as a vector
	*  @param i Row number 0 indexed */
	SIMD_FORCE_INLINE const btVector3& getRow(int i) const
	{
		btFullAssert(0 <= i && i < 3);
		return m_el[i];
	}

	/** @brief Get a mutable reference to a row of the matrix as a vector
	*  @param i Row number 0 indexed */
	SIMD_FORCE_INLINE btVector3& operator[](int i)
	{
		btFullAssert(0 <= i && i < 3);
		return m_el[i];
	}

	/** @brief Get a const reference to a row of the matrix as a vector
	*  @param i Row number 0 indexed */
	SIMD_FORCE_INLINE const btVector3& operator[](int i) const
	{
		btFullAssert(0 <= i && i < 3);
		return m_el[i];
	}

	/** @brief Multiply by the target matrix on the right
	*  @param m Rotation matrix to be applied
	* Equivalent to this = this * m */
	btMatrix3x3& operator*=(const btMatrix3x3& m);

	/** @brief Adds by the target matrix on the right
	*  @param m matrix to be applied
	* Equivalent to this = this + m */
	btMatrix3x3& operator+=(const btMatrix3x3& m);

	/** @brief Subtracts by the target matrix on the right
	*  @param m matrix to be applied
	* Equivalent to this = this - m */
	btMatrix3x3& operator-=(const btMatrix3x3& m);

	/** @brief Set from the rotational part of a 4x4 OpenGL matrix
	*  @param m A pointer to the beginning of the array of scalars
	*  (OpenGL matrices are column-major, hence the strided indexing) */
	void setFromOpenGLSubMatrix(const btScalar* m)
	{
		m_el[0].setValue(m[0], m[4], m[8]);
		m_el[1].setValue(m[1], m[5], m[9]);
		m_el[2].setValue(m[2], m[6], m[10]);
	}
	/** @brief Set the values of the matrix explicitly (row major)
	*  @param xx Top left
	*  @param xy Top Middle
	*  @param xz Top Right
	*  @param yx Middle Left
	*  @param yy Middle Middle
	*  @param yz Middle Right
	*  @param zx Bottom Left
	*  @param zy Bottom Middle
	*  @param zz Bottom Right*/
	void setValue(const btScalar& xx, const btScalar& xy, const btScalar& xz,
				  const btScalar& yx, const btScalar& yy, const btScalar& yz,
				  const btScalar& zx, const btScalar& zy, const btScalar& zz)
	{
		m_el[0].setValue(xx, xy, xz);
		m_el[1].setValue(yx, yy, yz);
		m_el[2].setValue(zx, zy, zz);
	}

	/** @brief Set the matrix from a quaternion
	*  @param q The Quaternion to match (must have non-zero length; it need not
	*  be normalized — the scale factor s = 2/|q|^2 compensates) */
	void setRotation(const btQuaternion& q)
	{
		btScalar d = q.length2();
		btFullAssert(d != btScalar(0.0));
		btScalar s = btScalar(2.0) / d;

#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		// Vectorized form of the scalar branch below: each row is assembled from
		// shuffled copies of (x,y,z,w) and (-x,-y,-z,-w), scaled by s, then the
		// identity row is added.
		__m128 vs, Q = q.get128();
		__m128i Qi = btCastfTo128i(Q);
		__m128 Y, Z;
		__m128 V1, V2, V3;
		__m128 V11, V21, V31;
		__m128 NQ = _mm_xor_ps(Q, btvMzeroMask);
		__m128i NQi = btCastfTo128i(NQ);

		V1 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 2, 3)));  // Y X Z W
		V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0, 0, 1, 3));                 // -X -X  Y  W
		V3 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(2, 1, 0, 3)));  // Z Y X W
		V1 = _mm_xor_ps(V1, vMPPP);                                         //	change the sign of the first element

		V11 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 1, 0, 3)));  // Y Y X W
		V21 = _mm_unpackhi_ps(Q, Q);                                         //  Z  Z  W  W
		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0, 2, 0, 3));                 //  X  Z -X -W

		V2 = V2 * V1;   //
		V1 = V1 * V11;  //
		V3 = V3 * V31;  //

		V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2, 3, 1, 3));                //	-Z -W  Y  W
		V11 = V11 * V21;                                                    //
		V21 = _mm_xor_ps(V21, vMPPP);                                       //	change the sign of the first element
		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3, 3, 1, 3));                //	 W  W -Y -W
		V31 = _mm_xor_ps(V31, vMPPP);                                       //	change the sign of the first element
		Y = btCastiTo128f(_mm_shuffle_epi32(NQi, BT_SHUFFLE(3, 2, 0, 3)));  // -W -Z -X -W
		Z = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 1, 3)));   //  Y  X  Y  W

		vs = _mm_load_ss(&s);
		V21 = V21 * Y;
		V31 = V31 * Z;

		V1 = V1 + V11;
		V2 = V2 + V21;
		V3 = V3 + V31;

		vs = bt_splat3_ps(vs, 0);
		//	s ready
		V1 = V1 * vs;
		V2 = V2 * vs;
		V3 = V3 * vs;

		V1 = V1 + v1000;
		V2 = V2 + v0100;
		V3 = V3 + v0010;

		m_el[0] = V1;
		m_el[1] = V2;
		m_el[2] = V3;
#else
		// Standard quaternion-to-matrix conversion (see e.g. Shoemake).
		btScalar xs = q.x() * s, ys = q.y() * s, zs = q.z() * s;
		btScalar wx = q.w() * xs, wy = q.w() * ys, wz = q.w() * zs;
		btScalar xx = q.x() * xs, xy = q.x() * ys, xz = q.x() * zs;
		btScalar yy = q.y() * ys, yz = q.y() * zs, zz = q.z() * zs;
		setValue(
			btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
			xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
			xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
#endif
	}

	/** @brief Set the matrix from euler angles using YPR around YXZ respectively
	*  @param yaw Yaw about Y axis
	*  @param pitch Pitch about X axis
	*  @param roll Roll about Z axis
	*/
	void setEulerYPR(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
	{
		// Delegates to setEulerZYX with the argument order swapped (x=roll, y=pitch, z=yaw).
		setEulerZYX(roll, pitch, yaw);
	}

	/** @brief Set the matrix from euler angles YPR around ZYX axes
	* @param eulerX Roll about X axis
	* @param eulerY Pitch around Y axis
	* @param eulerZ Yaw about Z axis
	*
	* These angles are used to produce a rotation matrix. The euler
	* angles are applied in ZYX order. I.e a vector is first rotated
	* about X then Y and then Z
	**/
	void setEulerZYX(btScalar eulerX, btScalar eulerY, btScalar eulerZ)
	{
		///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code
		btScalar ci(btCos(eulerX));
		btScalar cj(btCos(eulerY));
		btScalar ch(btCos(eulerZ));
		btScalar si(btSin(eulerX));
		btScalar sj(btSin(eulerY));
		btScalar sh(btSin(eulerZ));
		btScalar cc = ci * ch;
		btScalar cs = ci * sh;
		btScalar sc = si * ch;
		btScalar ss = si * sh;

		// Product Rz(eulerZ) * Ry(eulerY) * Rx(eulerX), expanded.
		setValue(cj * ch, sj * sc - cs, sj * cc + ss,
				 cj * sh, sj * ss + cc, sj * cs - sc,
				 -sj, cj * si, cj * ci);
	}

	/**@brief Set the matrix to the identity */
	void setIdentity()
	{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
		m_el[0] = v1000;
		m_el[1] = v0100;
		m_el[2] = v0010;
#else
		setValue(btScalar(1.0), btScalar(0.0), btScalar(0.0),
				 btScalar(0.0), btScalar(1.0), btScalar(0.0),
				 btScalar(0.0), btScalar(0.0), btScalar(1.0));
#endif
	}

	/**@brief Return a reference to a shared, lazily-initialized identity matrix */
	static const btMatrix3x3& getIdentity()
	{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
		static const btMatrix3x3
			identityMatrix(v1000, v0100, v0010);
#else
		static const btMatrix3x3
			identityMatrix(
				btScalar(1.0), btScalar(0.0), btScalar(0.0),
				btScalar(0.0), btScalar(1.0), btScalar(0.0),
				btScalar(0.0), btScalar(0.0), btScalar(1.0));
#endif
		return identityMatrix;
	}

	/**@brief Fill the rotational part of an OpenGL matrix and clear the shear/perspective
	* @param m The array to be filled (writes the first 12 entries of a
	* column-major 4x4 matrix: columns are this matrix's columns, with the
	* fourth component of each column set to 0) */
	void getOpenGLSubMatrix(btScalar * m) const
	{
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		// Transpose the three rows into three columns with unpack/shuffle.
		__m128 v0 = m_el[0].mVec128;
		__m128 v1 = m_el[1].mVec128;
		__m128 v2 = m_el[2].mVec128;  //  x2 y2 z2 w2
		__m128* vm = (__m128*)m;
		__m128 vT;

		v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0

		vT = _mm_unpackhi_ps(v0, v1);  //	z0 z1 * *
		v0 = _mm_unpacklo_ps(v0, v1);  //	x0 x1 y0 y1

		v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));                    // y0 y1 y2 0
		v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));                    // x0 x1 x2 0
		v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // z0 z1 z2 0

		vm[0] = v0;
		vm[1] = v1;
		vm[2] = v2;
#elif defined(BT_USE_NEON)
		// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
		static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0};
		float32x4_t* vm = (float32x4_t*)m;
		float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
		float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2  0 }, {y2 0}
		float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
		float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
		float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
		float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);  // z0 z1 z2  0
		
		vm[0] = v0;
		vm[1] = v1;
		vm[2] = v2;
#else
		m[0] = btScalar(m_el[0].x());
		m[1] = btScalar(m_el[1].x());
		m[2] = btScalar(m_el[2].x());
		m[3] = btScalar(0.0);
		m[4] = btScalar(m_el[0].y());
		m[5] = btScalar(m_el[1].y());
		m[6] = btScalar(m_el[2].y());
		m[7] = btScalar(0.0);
		m[8] = btScalar(m_el[0].z());
		m[9] = btScalar(m_el[1].z());
		m[10] = btScalar(m_el[2].z());
		m[11] = btScalar(0.0);
#endif
	}

	/**@brief Get the matrix represented as a quaternion
	* @param q The quaternion which will be set
	* (assumes this is a pure rotation matrix — TODO confirm callers guarantee that) */
	void getRotation(btQuaternion & q) const
	{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
		btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
		btScalar s, x;

		// Stage the four components in a union so they can be loaded into the
		// quaternion with a single vector store before the common scaling.
		union {
			btSimdFloat4 vec;
			btScalar f[4];
		} temp;

		if (trace > btScalar(0.0))
		{
			x = trace + btScalar(1.0);

			temp.f[0] = m_el[2].y() - m_el[1].z();
			temp.f[1] = m_el[0].z() - m_el[2].x();
			temp.f[2] = m_el[1].x() - m_el[0].y();
			temp.f[3] = x;
			//temp.f[3]= s * btScalar(0.5);
		}
		else
		{
			// Pick i as the index of the largest diagonal element, with j, k
			// the following indices in cyclic order.
			int i, j, k;
			if (m_el[0].x() < m_el[1].y())
			{
				if (m_el[1].y() < m_el[2].z())
				{
					i = 2;
					j = 0;
					k = 1;
				}
				else
				{
					i = 1;
					j = 2;
					k = 0;
				}
			}
			else
			{
				if (m_el[0].x() < m_el[2].z())
				{
					i = 2;
					j = 0;
					k = 1;
				}
				else
				{
					i = 0;
					j = 1;
					k = 2;
				}
			}

			x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);

			temp.f[3] = (m_el[k][j] - m_el[j][k]);
			temp.f[j] = (m_el[j][i] + m_el[i][j]);
			temp.f[k] = (m_el[k][i] + m_el[i][k]);
			temp.f[i] = x;
			//temp.f[i] = s * btScalar(0.5);
		}

		s = btSqrt(x);
		q.set128(temp.vec);
		s = btScalar(0.5) / s;

		q *= s;
#else
		btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();

		btScalar temp[4];

		if (trace > btScalar(0.0))
		{
			btScalar s = btSqrt(trace + btScalar(1.0));
			temp[3] = (s * btScalar(0.5));
			s = btScalar(0.5) / s;

			temp[0] = ((m_el[2].y() - m_el[1].z()) * s);
			temp[1] = ((m_el[0].z() - m_el[2].x()) * s);
			temp[2] = ((m_el[1].x() - m_el[0].y()) * s);
		}
		else
		{
			// i = index of largest diagonal element; j, k follow cyclically.
			int i = m_el[0].x() < m_el[1].y() ? (m_el[1].y() < m_el[2].z() ? 2 : 1) : (m_el[0].x() < m_el[2].z() ? 2 : 0);
			int j = (i + 1) % 3;
			int k = (i + 2) % 3;

			btScalar s = btSqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0));
			temp[i] = s * btScalar(0.5);
			s = btScalar(0.5) / s;

			temp[3] = (m_el[k][j] - m_el[j][k]) * s;
			temp[j] = (m_el[j][i] + m_el[i][j]) * s;
			temp[k] = (m_el[k][i] + m_el[i][k]) * s;
		}
		q.setValue(temp[0], temp[1], temp[2], temp[3]);
#endif
	}

	/**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR
	* @param yaw Yaw around Y axis
	* @param pitch Pitch around X axis
	* @param roll around Z axis */
	void getEulerYPR(btScalar & yaw, btScalar & pitch, btScalar & roll) const
	{
		// first use the normal calculus
		yaw = btScalar(btAtan2(m_el[1].x(), m_el[0].x()));
		pitch = btScalar(btAsin(-m_el[2].x()));
		roll = btScalar(btAtan2(m_el[2].y(), m_el[2].z()));

		// on pitch = +/-HalfPI the solution is not unique; shift yaw and roll
		// by PI to keep them in a consistent range
		if (btFabs(pitch) == SIMD_HALF_PI)
		{
			if (yaw > 0)
				yaw -= SIMD_PI;
			else
				yaw += SIMD_PI;

			if (roll > 0)
				roll -= SIMD_PI;
			else
				roll += SIMD_PI;
		}
	};

	/**@brief Get the matrix represented as euler angles around ZYX
	* @param yaw Yaw around Z axis
	* @param pitch Pitch around Y axis
	* @param roll around X axis
	* @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/
	void getEulerZYX(btScalar & yaw, btScalar & pitch, btScalar & roll, unsigned int solution_number = 1) const
	{
		struct Euler
		{
			btScalar yaw;
			btScalar pitch;
			btScalar roll;
		};

		Euler euler_out;
		Euler euler_out2;  //second solution
		//get the pointer to the raw data

		// Check that pitch is not at a singularity (|sin(pitch)| == 1)
		if (btFabs(m_el[2].x()) >= 1)
		{
			// Gimbal lock: yaw is indeterminate, conventionally set to 0.
			euler_out.yaw = 0;
			euler_out2.yaw = 0;

			// From difference of angles formula
			btScalar delta = btAtan2(m_el[0].x(), m_el[0].z());
			if (m_el[2].x() > 0)  //gimbal locked up
			{
				euler_out.pitch = SIMD_PI / btScalar(2.0);
				euler_out2.pitch = SIMD_PI / btScalar(2.0);
				euler_out.roll = euler_out.pitch + delta;
				euler_out2.roll = euler_out.pitch + delta;
			}
			else  // gimbal locked down
			{
				euler_out.pitch = -SIMD_PI / btScalar(2.0);
				euler_out2.pitch = -SIMD_PI / btScalar(2.0);
				euler_out.roll = -euler_out.pitch + delta;
				euler_out2.roll = -euler_out.pitch + delta;
			}
		}
		else
		{
			// The two valid (pitch, yaw, roll) decompositions.
			euler_out.pitch = -btAsin(m_el[2].x());
			euler_out2.pitch = SIMD_PI - euler_out.pitch;

			euler_out.roll = btAtan2(m_el[2].y() / btCos(euler_out.pitch),
									 m_el[2].z() / btCos(euler_out.pitch));
			euler_out2.roll = btAtan2(m_el[2].y() / btCos(euler_out2.pitch),
									  m_el[2].z() / btCos(euler_out2.pitch));

			euler_out.yaw = btAtan2(m_el[1].x() / btCos(euler_out.pitch),
									m_el[0].x() / btCos(euler_out.pitch));
			euler_out2.yaw = btAtan2(m_el[1].x() / btCos(euler_out2.pitch),
									 m_el[0].x() / btCos(euler_out2.pitch));
		}

		if (solution_number == 1)
		{
			yaw = euler_out.yaw;
			pitch = euler_out.pitch;
			roll = euler_out.roll;
		}
		else
		{
			yaw = euler_out2.yaw;
			pitch = euler_out2.pitch;
			roll = euler_out2.roll;
		}
	}

	/**@brief Create a scaled copy of the matrix
	* @param s Scaling vector The elements of the vector will scale each column */

	btMatrix3x3 scaled(const btVector3& s) const
	{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
		return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
#else
		return btMatrix3x3(
			m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
			m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
			m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());
#endif
	}

	/**@brief Return the determinant of the matrix */
	btScalar determinant() const;
	/**@brief Return the adjoint of the matrix */
	btMatrix3x3 adjoint() const;
	/**@brief Return the matrix with all values non negative */
	btMatrix3x3 absolute() const;
	/**@brief Return the transpose of the matrix */
	btMatrix3x3 transpose() const;
	/**@brief Return the inverse of the matrix */
	btMatrix3x3 inverse() const;

	/// Solve A * x = b, where b is a column vector. This is more efficient
	/// than computing the inverse in one-shot cases.
	///Solve33 is from Box2d, thanks to Erin Catto,
	/// NOTE(review): if |det| <= SIMD_EPSILON the determinant is left
	/// un-inverted, so the result is not a valid solution for (near-)singular
	/// matrices — callers appear expected to ensure A is well-conditioned.
	btVector3 solve33(const btVector3& b) const
	{
		btVector3 col1 = getColumn(0);
		btVector3 col2 = getColumn(1);
		btVector3 col3 = getColumn(2);

		// Cramer's rule via scalar triple products.
		btScalar det = btDot(col1, btCross(col2, col3));
		if (btFabs(det) > SIMD_EPSILON)
		{
			det = 1.0f / det;
		}
		btVector3 x;
		x[0] = det * btDot(b, btCross(col2, col3));
		x[1] = det * btDot(col1, btCross(b, col3));
		x[2] = det * btDot(col1, btCross(col2, b));
		return x;
	}

	btMatrix3x3 transposeTimes(const btMatrix3x3& m) const;
	btMatrix3x3 timesTranspose(const btMatrix3x3& m) const;

	/// Dot product of v with column 0 (i.e. row 0 of the transpose).
	SIMD_FORCE_INLINE btScalar tdotx(const btVector3& v) const
	{
		return m_el[0].x() * v.x() + m_el[1].x() * v.y() + m_el[2].x() * v.z();
	}
	/// Dot product of v with column 1.
	SIMD_FORCE_INLINE btScalar tdoty(const btVector3& v) const
	{
		return m_el[0].y() * v.x() + m_el[1].y() * v.y() + m_el[2].y() * v.z();
	}
	/// Dot product of v with column 2.
	SIMD_FORCE_INLINE btScalar tdotz(const btVector3& v) const
	{
		return m_el[0].z() * v.x() + m_el[1].z() * v.y() + m_el[2].z() * v.z();
	}

	///extractRotation is from "A robust method to extract the rotational part of deformations"
	///See http://dl.acm.org/citation.cfm?doid=2994258.2994269
	///decomposes a matrix A in a orthogonal matrix R and a
	///symmetric matrix S:
	///A = R*S.
	///note that R can include both rotation and scaling.
	/// NOTE(review): tolerance is added AFTER the division
	/// (1/|d| + tolerance rather than 1/(|d| + tolerance)), so it does not
	/// guard against division by zero — confirm this matches the paper/upstream.
	SIMD_FORCE_INLINE void extractRotation(btQuaternion & q, btScalar tolerance = 1.0e-9, int maxIter = 100)
	{
		int iter = 0;
		btScalar w;
		const btMatrix3x3& A = *this;
		for (iter = 0; iter < maxIter; iter++)
		{
			btMatrix3x3 R(q);
			btVector3 omega = (R.getColumn(0).cross(A.getColumn(0)) + R.getColumn(1).cross(A.getColumn(1)) + R.getColumn(2).cross(A.getColumn(2))) * (btScalar(1.0) / btFabs(R.getColumn(0).dot(A.getColumn(0)) + R.getColumn(1).dot(A.getColumn(1)) + R.getColumn(2).dot(A.getColumn(2))) +
																																					  tolerance);
			w = omega.norm();
			if (w < tolerance)
				break;
			q = btQuaternion(btVector3((btScalar(1.0) / w) * omega), w) *
				q;
			q.normalize();
		}
	}

	/**@brief diagonalizes this matrix by the Jacobi method.
	* @param rot stores the rotation from the coordinate system in which the matrix is diagonal to the original
	* coordinate system, i.e., old_this = rot * new_this * rot^T.
	* @param threshold See iteration
	* @param maxSteps The iteration stops when all off-diagonal elements are less than the threshold multiplied
	* by the sum of the absolute values of the diagonal, or when maxSteps have been executed.
	*
	* Note that this matrix is assumed to be symmetric.
	*/
	void diagonalize(btMatrix3x3 & rot, btScalar threshold, int maxSteps)
	{
		rot.setIdentity();
		for (int step = maxSteps; step > 0; step--)
		{
			// find off-diagonal element [p][q] with largest magnitude
			int p = 0;
			int q = 1;
			int r = 2;
			btScalar max = btFabs(m_el[0][1]);
			btScalar v = btFabs(m_el[0][2]);
			if (v > max)
			{
				q = 2;
				r = 1;
				max = v;
			}
			v = btFabs(m_el[1][2]);
			if (v > max)
			{
				p = 1;
				q = 2;
				r = 0;
				max = v;
			}

			btScalar t = threshold * (btFabs(m_el[0][0]) + btFabs(m_el[1][1]) + btFabs(m_el[2][2]));
			if (max <= t)
			{
				if (max <= SIMD_EPSILON * t)
				{
					return;
				}
				// close enough to converged: do one final sweep and stop
				step = 1;
			}

			// compute Jacobi rotation J which leads to a zero for element [p][q]
			btScalar mpq = m_el[p][q];
			btScalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
			btScalar theta2 = theta * theta;
			btScalar cos;
			btScalar sin;
			if (theta2 * theta2 < btScalar(10 / SIMD_EPSILON))
			{
				t = (theta >= 0) ? 1 / (theta + btSqrt(1 + theta2))
								 : 1 / (theta - btSqrt(1 + theta2));
				cos = 1 / btSqrt(1 + t * t);
				sin = cos * t;
			}
			else
			{
				// approximation for large theta-value, i.e., a nearly diagonal matrix
				t = 1 / (theta * (2 + btScalar(0.5) / theta2));
				cos = 1 - btScalar(0.5) * t * t;
				sin = cos * t;
			}

			// apply rotation to matrix (this = J^T * this * J)
			m_el[p][q] = m_el[q][p] = 0;
			m_el[p][p] -= t * mpq;
			m_el[q][q] += t * mpq;
			btScalar mrp = m_el[r][p];
			btScalar mrq = m_el[r][q];
			m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
			m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;

			// apply rotation to rot (rot = rot * J)
			for (int i = 0; i < 3; i++)
			{
				btVector3& row = rot[i];
				mrp = row[p];
				mrq = row[q];
				row[p] = cos * mrp - sin * mrq;
				row[q] = cos * mrq + sin * mrp;
			}
		}
	}

	/**@brief Calculate the matrix cofactor
	* @param r1 The first row to use for calculating the cofactor
	* @param c1 The first column to use for calculating the cofactor
	* @param r2 The second row to use for calculating the cofactor
	* @param c2 The second column to use for calculating the cofactor
	* See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details
	*/
	btScalar cofac(int r1, int c1, int r2, int c2) const
	{
		return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
	}

	void serialize(struct btMatrix3x3Data & dataOut) const;

	void serializeFloat(struct btMatrix3x3FloatData & dataOut) const;

	void deSerialize(const struct btMatrix3x3Data& dataIn);

	void deSerializeFloat(const struct btMatrix3x3FloatData& dataIn);

	void deSerializeDouble(const struct btMatrix3x3DoubleData& dataIn);
};
793 
/**@brief In-place matrix product: this = this * m. */
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator*=(const btMatrix3x3& m)
{
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	// Row i of the product is sum over k of this[i][k] * m[k]: each scalar of a
	// row is splatted across a register and multiplied with the matching row of m.
	__m128 rv00, rv01, rv02;
	__m128 rv10, rv11, rv12;
	__m128 rv20, rv21, rv22;
	__m128 mv0, mv1, mv2;

	rv02 = m_el[0].mVec128;
	rv12 = m_el[1].mVec128;
	rv22 = m_el[2].mVec128;

	// mask off the unused w lane of m's rows
	mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask);
	mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask);
	mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask);

	// rv0
	rv00 = bt_splat_ps(rv02, 0);
	rv01 = bt_splat_ps(rv02, 1);
	rv02 = bt_splat_ps(rv02, 2);

	rv00 = _mm_mul_ps(rv00, mv0);
	rv01 = _mm_mul_ps(rv01, mv1);
	rv02 = _mm_mul_ps(rv02, mv2);

	// rv1
	rv10 = bt_splat_ps(rv12, 0);
	rv11 = bt_splat_ps(rv12, 1);
	rv12 = bt_splat_ps(rv12, 2);

	rv10 = _mm_mul_ps(rv10, mv0);
	rv11 = _mm_mul_ps(rv11, mv1);
	rv12 = _mm_mul_ps(rv12, mv2);

	// rv2
	rv20 = bt_splat_ps(rv22, 0);
	rv21 = bt_splat_ps(rv22, 1);
	rv22 = bt_splat_ps(rv22, 2);

	rv20 = _mm_mul_ps(rv20, mv0);
	rv21 = _mm_mul_ps(rv21, mv1);
	rv22 = _mm_mul_ps(rv22, mv2);

	rv00 = _mm_add_ps(rv00, rv01);
	rv10 = _mm_add_ps(rv10, rv11);
	rv20 = _mm_add_ps(rv20, rv21);

	m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
	m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
	m_el[2].mVec128 = _mm_add_ps(rv20, rv22);

#elif defined(BT_USE_NEON)

	float32x4_t rv0, rv1, rv2;
	float32x4_t v0, v1, v2;
	float32x4_t mv0, mv1, mv2;

	v0 = m_el[0].mVec128;
	v1 = m_el[1].mVec128;
	v2 = m_el[2].mVec128;

	// mask off the unused w lane of m's rows
	mv0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
	mv1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
	mv2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

	// multiply-accumulate by lane: row_i = v_i[0]*mv0 + v_i[1]*mv1 + v_i[2]*mv2
	rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
	rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
	rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

	rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
	rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
	rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

	rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
	rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
	rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

	m_el[0].mVec128 = rv0;
	m_el[1].mVec128 = rv1;
	m_el[2].mVec128 = rv2;
#else
	// Scalar fallback: entry (i,j) is the dot of row i with m's column j.
	setValue(
		m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
		m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
		m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
#endif
	return *this;
}
883 
/**@brief Element-wise in-place addition: this = this + m. */
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator+=(const btMatrix3x3& m)
{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
	m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
	m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
	m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
#else
	setValue(
		m_el[0][0] + m.m_el[0][0],
		m_el[0][1] + m.m_el[0][1],
		m_el[0][2] + m.m_el[0][2],
		m_el[1][0] + m.m_el[1][0],
		m_el[1][1] + m.m_el[1][1],
		m_el[1][2] + m.m_el[1][2],
		m_el[2][0] + m.m_el[2][0],
		m_el[2][1] + m.m_el[2][1],
		m_el[2][2] + m.m_el[2][2]);
#endif
	return *this;
}
905 
/**@brief Return a copy of m with every element multiplied by the scalar k. */
SIMD_FORCE_INLINE btMatrix3x3
operator*(const btMatrix3x3& m, const btScalar& k)
{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
	// _mm_load_ss zeroes the upper lanes; splatting with pattern 0x80 yields
	// (k, k, k, 0), so the unused w lane of each row stays 0.
	__m128 vk = bt_splat_ps(_mm_load_ss((float*)&k), 0x80);
	return btMatrix3x3(
		_mm_mul_ps(m[0].mVec128, vk),
		_mm_mul_ps(m[1].mVec128, vk),
		_mm_mul_ps(m[2].mVec128, vk));
#elif defined(BT_USE_NEON)
	return btMatrix3x3(
		vmulq_n_f32(m[0].mVec128, k),
		vmulq_n_f32(m[1].mVec128, k),
		vmulq_n_f32(m[2].mVec128, k));
#else
	return btMatrix3x3(
		m[0].x() * k, m[0].y() * k, m[0].z() * k,
		m[1].x() * k, m[1].y() * k, m[1].z() * k,
		m[2].x() * k, m[2].y() * k, m[2].z() * k);
#endif
}
927 
928 SIMD_FORCE_INLINE btMatrix3x3
929 operator+(const btMatrix3x3& m1, const btMatrix3x3& m2)
930 {
931 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
932 	return btMatrix3x3(
933 		m1[0].mVec128 + m2[0].mVec128,
934 		m1[1].mVec128 + m2[1].mVec128,
935 		m1[2].mVec128 + m2[2].mVec128);
936 #else
937 	return btMatrix3x3(
938 		m1[0][0] + m2[0][0],
939 		m1[0][1] + m2[0][1],
940 		m1[0][2] + m2[0][2],
941 
942 		m1[1][0] + m2[1][0],
943 		m1[1][1] + m2[1][1],
944 		m1[1][2] + m2[1][2],
945 
946 		m1[2][0] + m2[2][0],
947 		m1[2][1] + m2[2][1],
948 		m1[2][2] + m2[2][2]);
949 #endif
950 }
951 
952 SIMD_FORCE_INLINE btMatrix3x3
953 operator-(const btMatrix3x3& m1, const btMatrix3x3& m2)
954 {
955 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
956 	return btMatrix3x3(
957 		m1[0].mVec128 - m2[0].mVec128,
958 		m1[1].mVec128 - m2[1].mVec128,
959 		m1[2].mVec128 - m2[2].mVec128);
960 #else
961 	return btMatrix3x3(
962 		m1[0][0] - m2[0][0],
963 		m1[0][1] - m2[0][1],
964 		m1[0][2] - m2[0][2],
965 
966 		m1[1][0] - m2[1][0],
967 		m1[1][1] - m2[1][1],
968 		m1[1][2] - m2[1][2],
969 
970 		m1[2][0] - m2[2][0],
971 		m1[2][1] - m2[2][1],
972 		m1[2][2] - m2[2][2]);
973 #endif
974 }
975 
976 SIMD_FORCE_INLINE btMatrix3x3&
977 btMatrix3x3::operator-=(const btMatrix3x3& m)
978 {
979 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
980 	m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
981 	m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
982 	m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
983 #else
984 	setValue(
985 		m_el[0][0] - m.m_el[0][0],
986 		m_el[0][1] - m.m_el[0][1],
987 		m_el[0][2] - m.m_el[0][2],
988 		m_el[1][0] - m.m_el[1][0],
989 		m_el[1][1] - m.m_el[1][1],
990 		m_el[1][2] - m.m_el[1][2],
991 		m_el[2][0] - m.m_el[2][0],
992 		m_el[2][1] - m.m_el[2][1],
993 		m_el[2][2] - m.m_el[2][2]);
994 #endif
995 	return *this;
996 }
997 
998 SIMD_FORCE_INLINE btScalar
determinant()999 btMatrix3x3::determinant() const
1000 {
1001 	return btTriple((*this)[0], (*this)[1], (*this)[2]);
1002 }
1003 
1004 SIMD_FORCE_INLINE btMatrix3x3
absolute()1005 btMatrix3x3::absolute() const
1006 {
1007 #if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
1008 	return btMatrix3x3(
1009 		_mm_and_ps(m_el[0].mVec128, btvAbsfMask),
1010 		_mm_and_ps(m_el[1].mVec128, btvAbsfMask),
1011 		_mm_and_ps(m_el[2].mVec128, btvAbsfMask));
1012 #elif defined(BT_USE_NEON)
1013 	return btMatrix3x3(
1014 		(float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
1015 		(float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
1016 		(float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
1017 #else
1018 	return btMatrix3x3(
1019 		btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
1020 		btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
1021 		btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
1022 #endif
1023 }
1024 
/**@brief Return the transpose of this matrix.
* The SIMD paths zero the (unused) w lane of each output row; timesTranspose()
* below relies on that (see its comment). */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::transpose() const
{
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
	__m128 v0 = m_el[0].mVec128;
	__m128 v1 = m_el[1].mVec128;
	__m128 v2 = m_el[2].mVec128;  //  x2 y2 z2 w2
	__m128 vT;

	v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0

	// Interleave rows 0 and 1 to pair up same-named components.
	vT = _mm_unpackhi_ps(v0, v1);  //	z0 z1 * *
	v0 = _mm_unpacklo_ps(v0, v1);  //	x0 x1 y0 y1

	v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));                    // y0 y1 y2 0
	v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));                    // x0 x1 x2 0
	// Take the low double (z0 z1) from vT and the high double (z2 0) from v2.
	v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // z0 z1 z2 0

	return btMatrix3x3(v0, v1, v2);
#elif defined(BT_USE_NEON)
	// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
	static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0};
	float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
	float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2  0 }, {y2 0}
	float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
	float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
	float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
	float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);  // z0 z1 z2  0
	return btMatrix3x3(v0, v1, v2);
#else
	// Scalar fallback: the rows of the result are the columns of *this.
	return btMatrix3x3(m_el[0].x(), m_el[1].x(), m_el[2].x(),
					   m_el[0].y(), m_el[1].y(), m_el[2].y(),
					   m_el[0].z(), m_el[1].z(), m_el[2].z());
#endif
}
1060 
/**@brief Return the adjoint (classical adjugate) of the matrix.
* Each entry (i,j) is built from cofac(r1, c1, r2, c2), which presumably computes
* the 2x2 minor determinant of the named rows/columns (declared elsewhere in the
* class — confirm against its definition). inverse() below uses the same cofac
* layout scaled by 1/determinant. */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::adjoint() const
{
	return btMatrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
					   cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
					   cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
}
1068 
1069 SIMD_FORCE_INLINE btMatrix3x3
inverse()1070 btMatrix3x3::inverse() const
1071 {
1072 	btVector3 co(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
1073 	btScalar det = (*this)[0].dot(co);
1074 	//btFullAssert(det != btScalar(0.0));
1075 	btAssert(det != btScalar(0.0));
1076 	btScalar s = btScalar(1.0) / det;
1077 	return btMatrix3x3(co.x() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s,
1078 					   co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
1079 					   co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
1080 }
1081 
/**@brief Return transpose(*this) * m without materializing the transpose.
* Row i of the result is sum over k of this[k][i] * m.row(k). */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
{
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
	// zeros w
	//    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
	__m128 row = m_el[0].mVec128;
	// Clear the w lane of m's rows so stale data there never reaches the accumulators.
	__m128 m0 = _mm_and_ps(m.getRow(0).mVec128, btvFFF0fMask);
	__m128 m1 = _mm_and_ps(m.getRow(1).mVec128, btvFFF0fMask);
	__m128 m2 = _mm_and_ps(m.getRow(2).mVec128, btvFFF0fMask);
	// k = 0 term: shuffle masks 0x00/0x55/0xaa broadcast this row's x/y/z lane.
	__m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
	__m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
	__m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
	row = m_el[1].mVec128;
	// Accumulate the k = 1 term.
	r0 = _mm_add_ps(r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
	r1 = _mm_add_ps(r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
	r2 = _mm_add_ps(r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
	row = m_el[2].mVec128;
	// Accumulate the k = 2 term.
	r0 = _mm_add_ps(r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
	r1 = _mm_add_ps(r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
	r2 = _mm_add_ps(r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
	return btMatrix3x3(r0, r1, r2);

#elif defined BT_USE_NEON
	// zeros w
	static const uint32x4_t xyzMask = (const uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0};
	float32x4_t m0 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(0).mVec128, xyzMask);
	float32x4_t m1 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(1).mVec128, xyzMask);
	float32x4_t m2 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(2).mVec128, xyzMask);
	float32x4_t row = m_el[0].mVec128;
	// Same accumulation as the SSE path, using lane-broadcast multiply-accumulate.
	float32x4_t r0 = vmulq_lane_f32(m0, vget_low_f32(row), 0);
	float32x4_t r1 = vmulq_lane_f32(m0, vget_low_f32(row), 1);
	float32x4_t r2 = vmulq_lane_f32(m0, vget_high_f32(row), 0);
	row = m_el[1].mVec128;
	r0 = vmlaq_lane_f32(r0, m1, vget_low_f32(row), 0);
	r1 = vmlaq_lane_f32(r1, m1, vget_low_f32(row), 1);
	r2 = vmlaq_lane_f32(r2, m1, vget_high_f32(row), 0);
	row = m_el[2].mVec128;
	r0 = vmlaq_lane_f32(r0, m2, vget_low_f32(row), 0);
	r1 = vmlaq_lane_f32(r1, m2, vget_low_f32(row), 1);
	r2 = vmlaq_lane_f32(r2, m2, vget_high_f32(row), 0);
	return btMatrix3x3(r0, r1, r2);
#else
	// Scalar fallback: result[i][j] = column_i(this) . column_j-of-rows(m).
	return btMatrix3x3(
		m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
		m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
		m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
		m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
		m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
		m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
		m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
		m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
		m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
#endif
}
1137 
/**@brief Return (*this) * transpose(m).
* The SIMD paths form mT = m.transpose() once, then do a standard
* row-times-rows product against mT. */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::timesTranspose(const btMatrix3x3& m) const
{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
	__m128 a0 = m_el[0].mVec128;
	__m128 a1 = m_el[1].mVec128;
	__m128 a2 = m_el[2].mVec128;

	btMatrix3x3 mT = m.transpose();  // we rely on transpose() zeroing w channel so that we don't have to do it here
	__m128 mx = mT[0].mVec128;
	__m128 my = mT[1].mVec128;
	__m128 mz = mT[2].mVec128;

	// row_i(result) = a_i.x * mx + a_i.y * my + a_i.z * mz
	// (shuffle masks 0x00 / 0x55 / 0xaa broadcast lanes x / y / z).
	__m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
	__m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
	__m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
	r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
	r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
	r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
	r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
	r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
	r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
	return btMatrix3x3(r0, r1, r2);

#elif defined BT_USE_NEON
	float32x4_t a0 = m_el[0].mVec128;
	float32x4_t a1 = m_el[1].mVec128;
	float32x4_t a2 = m_el[2].mVec128;

	btMatrix3x3 mT = m.transpose();  // we rely on transpose() zeroing w channel so that we don't have to do it here
	float32x4_t mx = mT[0].mVec128;
	float32x4_t my = mT[1].mVec128;
	float32x4_t mz = mT[2].mVec128;

	// Same accumulation as the SSE path, via lane-broadcast multiply-accumulate.
	float32x4_t r0 = vmulq_lane_f32(mx, vget_low_f32(a0), 0);
	float32x4_t r1 = vmulq_lane_f32(mx, vget_low_f32(a1), 0);
	float32x4_t r2 = vmulq_lane_f32(mx, vget_low_f32(a2), 0);
	r0 = vmlaq_lane_f32(r0, my, vget_low_f32(a0), 1);
	r1 = vmlaq_lane_f32(r1, my, vget_low_f32(a1), 1);
	r2 = vmlaq_lane_f32(r2, my, vget_low_f32(a2), 1);
	r0 = vmlaq_lane_f32(r0, mz, vget_high_f32(a0), 0);
	r1 = vmlaq_lane_f32(r1, mz, vget_high_f32(a1), 0);
	r2 = vmlaq_lane_f32(r2, mz, vget_high_f32(a2), 0);
	return btMatrix3x3(r0, r1, r2);

#else
	// Scalar fallback: result[i][j] = row_i(this) . row_j(m).
	return btMatrix3x3(
		m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
		m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
		m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
#endif
}
1190 
1191 SIMD_FORCE_INLINE btVector3
1192 operator*(const btMatrix3x3& m, const btVector3& v)
1193 {
1194 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
1195 	return v.dot3(m[0], m[1], m[2]);
1196 #else
1197 	return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
1198 #endif
1199 }
1200 
/**@brief Vector times matrix: treats v as a row vector, so the result is
* v.x*row0 + v.y*row1 + v.z*row2 (equivalently, transpose(m) * v). */
SIMD_FORCE_INLINE btVector3
operator*(const btVector3& v, const btMatrix3x3& m)
{
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))

	const __m128 vv = v.mVec128;

	// Broadcast each component of v across a whole register.
	__m128 c0 = bt_splat_ps(vv, 0);
	__m128 c1 = bt_splat_ps(vv, 1);
	__m128 c2 = bt_splat_ps(vv, 2);

	// Mask the w lane of each row to 0, then sum the scaled rows.
	c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask));
	c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask));
	c0 = _mm_add_ps(c0, c1);
	c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask));

	return btVector3(_mm_add_ps(c0, c2));
#elif defined(BT_USE_NEON)
	const float32x4_t vv = v.mVec128;
	const float32x2_t vlo = vget_low_f32(vv);
	const float32x2_t vhi = vget_high_f32(vv);

	float32x4_t c0, c1, c2;

	// Mask the w lane of each row to 0.
	c0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
	c1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
	c2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

	// Scale each row by the matching component of v and sum.
	c0 = vmulq_lane_f32(c0, vlo, 0);
	c1 = vmulq_lane_f32(c1, vlo, 1);
	c2 = vmulq_lane_f32(c2, vhi, 0);
	c0 = vaddq_f32(c0, c1);
	c0 = vaddq_f32(c0, c2);

	return btVector3(c0);
#else
	// Scalar fallback: tdotx/tdoty/tdotz presumably dot v with the matrix
	// columns (see their definitions earlier in the class) — same result.
	return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
#endif
}
1240 
/**@brief Matrix product m1 * m2.
* The SIMD paths use the row-broadcast scheme:
* row_i(result) = sum over k of m1[i][k] * row_k(m2),
* with the w lane of m2's rows masked to zero so it stays zero in the result. */
SIMD_FORCE_INLINE btMatrix3x3
operator*(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))

	__m128 m10 = m1[0].mVec128;
	__m128 m11 = m1[1].mVec128;
	__m128 m12 = m1[2].mVec128;

	// k = 0: scale row 0 of m2 by the x component of each row of m1.
	__m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);

	__m128 c0 = bt_splat_ps(m10, 0);
	__m128 c1 = bt_splat_ps(m11, 0);
	__m128 c2 = bt_splat_ps(m12, 0);

	c0 = _mm_mul_ps(c0, m2v);
	c1 = _mm_mul_ps(c1, m2v);
	c2 = _mm_mul_ps(c2, m2v);

	// k = 1: same with row 1 of m2 and the y components.
	m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);

	__m128 c0_1 = bt_splat_ps(m10, 1);
	__m128 c1_1 = bt_splat_ps(m11, 1);
	__m128 c2_1 = bt_splat_ps(m12, 1);

	c0_1 = _mm_mul_ps(c0_1, m2v);
	c1_1 = _mm_mul_ps(c1_1, m2v);
	c2_1 = _mm_mul_ps(c2_1, m2v);

	// k = 2: row 2 of m2 and the z components; m10..m12 are reused as scratch.
	m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);

	c0 = _mm_add_ps(c0, c0_1);
	c1 = _mm_add_ps(c1, c1_1);
	c2 = _mm_add_ps(c2, c2_1);

	m10 = bt_splat_ps(m10, 2);
	m11 = bt_splat_ps(m11, 2);
	m12 = bt_splat_ps(m12, 2);

	m10 = _mm_mul_ps(m10, m2v);
	m11 = _mm_mul_ps(m11, m2v);
	m12 = _mm_mul_ps(m12, m2v);

	c0 = _mm_add_ps(c0, m10);
	c1 = _mm_add_ps(c1, m11);
	c2 = _mm_add_ps(c2, m12);

	return btMatrix3x3(c0, c1, c2);

#elif defined(BT_USE_NEON)

	float32x4_t rv0, rv1, rv2;
	float32x4_t v0, v1, v2;
	float32x4_t mv0, mv1, mv2;

	v0 = m1[0].mVec128;
	v1 = m1[1].mVec128;
	v2 = m1[2].mVec128;

	// Mask the w lane of m2's rows to zero.
	mv0 = (float32x4_t)vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask);
	mv1 = (float32x4_t)vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask);
	mv2 = (float32x4_t)vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask);

	// Lane-broadcast multiply-accumulate of the three k terms.
	rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
	rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
	rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

	rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
	rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
	rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

	rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
	rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
	rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

	return btMatrix3x3(rv0, rv1, rv2);

#else
	// Scalar fallback: result[i][j] = row_i(m1) . column_j(m2), via m2's tdot helpers.
	return btMatrix3x3(
		m2.tdotx(m1[0]), m2.tdoty(m1[0]), m2.tdotz(m1[0]),
		m2.tdotx(m1[1]), m2.tdoty(m1[1]), m2.tdotz(m1[1]),
		m2.tdotx(m1[2]), m2.tdoty(m1[2]), m2.tdotz(m1[2]));
#endif
}
1325 
1326 /*
1327 SIMD_FORCE_INLINE btMatrix3x3 btMultTransposeLeft(const btMatrix3x3& m1, const btMatrix3x3& m2) {
1328 return btMatrix3x3(
1329 m1[0][0] * m2[0][0] + m1[1][0] * m2[1][0] + m1[2][0] * m2[2][0],
1330 m1[0][0] * m2[0][1] + m1[1][0] * m2[1][1] + m1[2][0] * m2[2][1],
1331 m1[0][0] * m2[0][2] + m1[1][0] * m2[1][2] + m1[2][0] * m2[2][2],
1332 m1[0][1] * m2[0][0] + m1[1][1] * m2[1][0] + m1[2][1] * m2[2][0],
1333 m1[0][1] * m2[0][1] + m1[1][1] * m2[1][1] + m1[2][1] * m2[2][1],
1334 m1[0][1] * m2[0][2] + m1[1][1] * m2[1][2] + m1[2][1] * m2[2][2],
1335 m1[0][2] * m2[0][0] + m1[1][2] * m2[1][0] + m1[2][2] * m2[2][0],
1336 m1[0][2] * m2[0][1] + m1[1][2] * m2[1][1] + m1[2][2] * m2[2][1],
1337 m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
1338 }
1339 */
1340 
1341 /**@brief Equality operator between two matrices
1342 * It will test all elements are equal.  */
1343 SIMD_FORCE_INLINE bool operator==(const btMatrix3x3& m1, const btMatrix3x3& m2)
1344 {
1345 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
1346 
1347 	__m128 c0, c1, c2;
1348 
1349 	c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
1350 	c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
1351 	c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);
1352 
1353 	c0 = _mm_and_ps(c0, c1);
1354 	c0 = _mm_and_ps(c0, c2);
1355 
1356 	int m = _mm_movemask_ps((__m128)c0);
1357 	return (0x7 == (m & 0x7));
1358 
1359 #else
1360 	return (m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
1361 			m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
1362 			m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2]);
1363 #endif
1364 }
1365 
///for serialization: plain-data mirror of btMatrix3x3 storing three single-precision rows
struct btMatrix3x3FloatData
{
	btVector3FloatData m_el[3];  // the three matrix rows
};
1371 
///for serialization: plain-data mirror of btMatrix3x3 storing three double-precision rows
struct btMatrix3x3DoubleData
{
	btVector3DoubleData m_el[3];  // the three matrix rows
};
1377 
serialize(struct btMatrix3x3Data & dataOut)1378 SIMD_FORCE_INLINE void btMatrix3x3::serialize(struct btMatrix3x3Data& dataOut) const
1379 {
1380 	for (int i = 0; i < 3; i++)
1381 		m_el[i].serialize(dataOut.m_el[i]);
1382 }
1383 
serializeFloat(struct btMatrix3x3FloatData & dataOut)1384 SIMD_FORCE_INLINE void btMatrix3x3::serializeFloat(struct btMatrix3x3FloatData& dataOut) const
1385 {
1386 	for (int i = 0; i < 3; i++)
1387 		m_el[i].serializeFloat(dataOut.m_el[i]);
1388 }
1389 
deSerialize(const struct btMatrix3x3Data & dataIn)1390 SIMD_FORCE_INLINE void btMatrix3x3::deSerialize(const struct btMatrix3x3Data& dataIn)
1391 {
1392 	for (int i = 0; i < 3; i++)
1393 		m_el[i].deSerialize(dataIn.m_el[i]);
1394 }
1395 
deSerializeFloat(const struct btMatrix3x3FloatData & dataIn)1396 SIMD_FORCE_INLINE void btMatrix3x3::deSerializeFloat(const struct btMatrix3x3FloatData& dataIn)
1397 {
1398 	for (int i = 0; i < 3; i++)
1399 		m_el[i].deSerializeFloat(dataIn.m_el[i]);
1400 }
1401 
deSerializeDouble(const struct btMatrix3x3DoubleData & dataIn)1402 SIMD_FORCE_INLINE void btMatrix3x3::deSerializeDouble(const struct btMatrix3x3DoubleData& dataIn)
1403 {
1404 	for (int i = 0; i < 3; i++)
1405 		m_el[i].deSerializeDouble(dataIn.m_el[i]);
1406 }
1407 
1408 #endif  //BT_MATRIX3x3_H
1409