1 /*
2 Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans http://continuousphysics.com/Bullet/
3
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
9
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
13 */
14
15 #ifndef BT_MATRIX3x3_H
16 #define BT_MATRIX3x3_H
17
18 #include "btVector3.h"
19 #include "btQuaternion.h"
20 #include <stdio.h>
21
22 #ifdef BT_USE_SSE
23 //const __m128 ATTRIBUTE_ALIGNED16(v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
24 //const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
25 #define vMPPP (_mm_set_ps(+0.0f, +0.0f, +0.0f, -0.0f))
26 #endif
27
28 #if defined(BT_USE_SSE)
29 #define v1000 (_mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f))
30 #define v0100 (_mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f))
31 #define v0010 (_mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f))
32 #elif defined(BT_USE_NEON)
33 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
34 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
35 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
36 #endif
37
38 #ifdef BT_USE_DOUBLE_PRECISION
39 #define btMatrix3x3Data btMatrix3x3DoubleData
40 #else
41 #define btMatrix3x3Data btMatrix3x3FloatData
42 #endif //BT_USE_DOUBLE_PRECISION
43
44 /**@brief The btMatrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with btQuaternion, btTransform and btVector3.
45 * Make sure to only include a pure orthogonal matrix without scaling. */
ATTRIBUTE_ALIGNED16(class)
btMatrix3x3
{
	///Data storage for the matrix, each vector is a row of the matrix
	btVector3 m_el[3];

public:
	/** @brief No initialization constructor */
	btMatrix3x3() {}

	//		explicit btMatrix3x3(const btScalar *m) { setFromOpenGLSubMatrix(m); }

	/**@brief Constructor from Quaternion */
	explicit btMatrix3x3(const btQuaternion& q) { setRotation(q); }
	/*
	template <typename btScalar>
	Matrix3x3(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
	{
		setEulerYPR(yaw, pitch, roll);
	}
	*/
	/** @brief Constructor with row major formatting */
	btMatrix3x3(const btScalar& xx, const btScalar& xy, const btScalar& xz,
				const btScalar& yx, const btScalar& yy, const btScalar& yz,
				const btScalar& zx, const btScalar& zy, const btScalar& zz)
	{
		setValue(xx, xy, xz,
				 yx, yy, yz,
				 zx, zy, zz);
	}

#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
	/** @brief Constructor from three SIMD row registers (SSE/NEON builds only) */
	SIMD_FORCE_INLINE btMatrix3x3(const btSimdFloat4 v0, const btSimdFloat4 v1, const btSimdFloat4 v2)
	{
		m_el[0].mVec128 = v0;
		m_el[1].mVec128 = v1;
		m_el[2].mVec128 = v2;
	}

	/** @brief Constructor from three row vectors */
	SIMD_FORCE_INLINE btMatrix3x3(const btVector3& v0, const btVector3& v1, const btVector3& v2)
	{
		m_el[0] = v0;
		m_el[1] = v1;
		m_el[2] = v2;
	}

	// Copy constructor (copies the raw 128-bit registers, including the unused w lanes)
	SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& rhs)
	{
		m_el[0].mVec128 = rhs.m_el[0].mVec128;
		m_el[1].mVec128 = rhs.m_el[1].mVec128;
		m_el[2].mVec128 = rhs.m_el[2].mVec128;
	}

	// Assignment Operator
	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& m)
	{
		m_el[0].mVec128 = m.m_el[0].mVec128;
		m_el[1].mVec128 = m.m_el[1].mVec128;
		m_el[2].mVec128 = m.m_el[2].mVec128;

		return *this;
	}

#else

	/** @brief Copy constructor */
	SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& other)
	{
		m_el[0] = other.m_el[0];
		m_el[1] = other.m_el[1];
		m_el[2] = other.m_el[2];
	}

	/** @brief Assignment Operator */
	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& other)
	{
		m_el[0] = other.m_el[0];
		m_el[1] = other.m_el[1];
		m_el[2] = other.m_el[2];
		return *this;
	}

#endif

	/** @brief Get a column of the matrix as a vector
	*  @param i Column number 0 indexed
	*  Note: returns by value (the storage is row-major, so a column is gathered). */
	SIMD_FORCE_INLINE btVector3 getColumn(int i) const
	{
		return btVector3(m_el[0][i], m_el[1][i], m_el[2][i]);
	}

	/** @brief Get a row of the matrix as a vector
	*  @param i Row number 0 indexed */
	SIMD_FORCE_INLINE const btVector3& getRow(int i) const
	{
		btFullAssert(0 <= i && i < 3);
		return m_el[i];
	}

	/** @brief Get a mutable reference to a row of the matrix as a vector
	*  @param i Row number 0 indexed */
	SIMD_FORCE_INLINE btVector3& operator[](int i)
	{
		btFullAssert(0 <= i && i < 3);
		return m_el[i];
	}

	/** @brief Get a const reference to a row of the matrix as a vector
	*  @param i Row number 0 indexed */
	SIMD_FORCE_INLINE const btVector3& operator[](int i) const
	{
		btFullAssert(0 <= i && i < 3);
		return m_el[i];
	}

	/** @brief Multiply by the target matrix on the right
	*  @param m Rotation matrix to be applied
	*  Equivalent to this = this * m */
	btMatrix3x3& operator*=(const btMatrix3x3& m);

	/** @brief Adds by the target matrix on the right
	*  @param m matrix to be applied
	*  Equivalent to this = this + m */
	btMatrix3x3& operator+=(const btMatrix3x3& m);

	/** @brief Subtracts by the target matrix on the right
	*  @param m matrix to be applied
	*  Equivalent to this = this - m */
	btMatrix3x3& operator-=(const btMatrix3x3& m);

	/** @brief Set from the rotational part of a 4x4 OpenGL matrix
	*  @param m A pointer to the beginning of the array of scalars
	*  OpenGL matrices are column-major, hence the strided reads (m[0], m[4], m[8] = first row). */
	void setFromOpenGLSubMatrix(const btScalar* m)
	{
		m_el[0].setValue(m[0], m[4], m[8]);
		m_el[1].setValue(m[1], m[5], m[9]);
		m_el[2].setValue(m[2], m[6], m[10]);
	}
	/** @brief Set the values of the matrix explicitly (row major)
	*  @param xx Top left
	*  @param xy Top Middle
	*  @param xz Top Right
	*  @param yx Middle Left
	*  @param yy Middle Middle
	*  @param yz Middle Right
	*  @param zx Bottom Left
	*  @param zy Bottom Middle
	*  @param zz Bottom Right*/
	void setValue(const btScalar& xx, const btScalar& xy, const btScalar& xz,
				  const btScalar& yx, const btScalar& yy, const btScalar& yz,
				  const btScalar& zx, const btScalar& zy, const btScalar& zz)
	{
		m_el[0].setValue(xx, xy, xz);
		m_el[1].setValue(yx, yy, yz);
		m_el[2].setValue(zx, zy, zz);
	}

	/** @brief Set the matrix from a quaternion
	*  @param q The Quaternion to match
	*  Uses the standard quaternion-to-matrix formula; q need not be normalized
	*  because the terms are scaled by 2/|q|^2. */
	void setRotation(const btQuaternion& q)
	{
		btScalar d = q.length2();
		btFullAssert(d != btScalar(0.0));
		btScalar s = btScalar(2.0) / d;

#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		__m128 vs, Q = q.get128();
		__m128i Qi = btCastfTo128i(Q);
		__m128 Y, Z;
		__m128 V1, V2, V3;
		__m128 V11, V21, V31;
		__m128 NQ = _mm_xor_ps(Q, btvMzeroMask);
		__m128i NQi = btCastfTo128i(NQ);

		V1 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 2, 3)));  // Y X Z W
		V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0, 0, 1, 3));                 // -X -X  Y  W
		V3 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(2, 1, 0, 3)));  // Z Y X W
		V1 = _mm_xor_ps(V1, vMPPP);                                         //	change the sign of the first element

		V11 = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 1, 0, 3)));  // Y Y X W
		V21 = _mm_unpackhi_ps(Q, Q);                                         //  Z  Z  W  W
		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0, 2, 0, 3));                 //  X  Z -X -W

		V2 = V2 * V1;  //
		V1 = V1 * V11;  //
		V3 = V3 * V31;  //

		V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2, 3, 1, 3));                 // -Z -W  Y  W
		V11 = V11 * V21;                                                     //
		V21 = _mm_xor_ps(V21, vMPPP);                                        //	change the sign of the first element
		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3, 3, 1, 3));                 //  W  W -Y -W
		V31 = _mm_xor_ps(V31, vMPPP);                                        //	change the sign of the first element
		Y = btCastiTo128f(_mm_shuffle_epi32(NQi, BT_SHUFFLE(3, 2, 0, 3)));   // -W -Z -X -W
		Z = btCastiTo128f(_mm_shuffle_epi32(Qi, BT_SHUFFLE(1, 0, 1, 3)));    //  Y  X  Y  W

		vs = _mm_load_ss(&s);
		V21 = V21 * Y;
		V31 = V31 * Z;

		V1 = V1 + V11;
		V2 = V2 + V21;
		V3 = V3 + V31;

		vs = bt_splat3_ps(vs, 0);
		//	s ready
		V1 = V1 * vs;
		V2 = V2 * vs;
		V3 = V3 * vs;

		// add the identity to finish 1 - 2*(...) style diagonal terms
		V1 = V1 + v1000;
		V2 = V2 + v0100;
		V3 = V3 + v0010;

		m_el[0] = V1;
		m_el[1] = V2;
		m_el[2] = V3;
#else
		btScalar xs = q.x() * s, ys = q.y() * s, zs = q.z() * s;
		btScalar wx = q.w() * xs, wy = q.w() * ys, wz = q.w() * zs;
		btScalar xx = q.x() * xs, xy = q.x() * ys, xz = q.x() * zs;
		btScalar yy = q.y() * ys, yz = q.y() * zs, zz = q.z() * zs;
		setValue(
			btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
			xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
			xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
#endif
	}

	/** @brief Set the matrix from euler angles using YPR around YXZ respectively
	*  @param yaw Yaw about Y axis
	*  @param pitch Pitch about X axis
	*  @param roll Roll about Z axis
	*/
	void setEulerYPR(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
	{
		setEulerZYX(roll, pitch, yaw);
	}

	/** @brief Set the matrix from euler angles YPR around ZYX axes
	* @param eulerX Roll about X axis
	* @param eulerY Pitch around Y axis
	* @param eulerZ Yaw about Z axis
	*
	* These angles are used to produce a rotation matrix. The euler
	* angles are applied in ZYX order. I.e a vector is first rotated
	* about X then Y and then Z
	**/
	void setEulerZYX(btScalar eulerX, btScalar eulerY, btScalar eulerZ)
	{
		///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code
		btScalar ci(btCos(eulerX));
		btScalar cj(btCos(eulerY));
		btScalar ch(btCos(eulerZ));
		btScalar si(btSin(eulerX));
		btScalar sj(btSin(eulerY));
		btScalar sh(btSin(eulerZ));
		btScalar cc = ci * ch;
		btScalar cs = ci * sh;
		btScalar sc = si * ch;
		btScalar ss = si * sh;

		setValue(cj * ch, sj * sc - cs, sj * cc + ss,
				 cj * sh, sj * ss + cc, sj * cs - sc,
				 -sj, cj * si, cj * ci);
	}

	/**@brief Set the matrix to the identity */
	void setIdentity()
	{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
		m_el[0] = v1000;
		m_el[1] = v0100;
		m_el[2] = v0010;
#else
		setValue(btScalar(1.0), btScalar(0.0), btScalar(0.0),
				 btScalar(0.0), btScalar(1.0), btScalar(0.0),
				 btScalar(0.0), btScalar(0.0), btScalar(1.0));
#endif
	}

	/**@brief Return a const reference to a shared identity matrix instance */
	static const btMatrix3x3& getIdentity()
	{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
		static const btMatrix3x3
			identityMatrix(v1000, v0100, v0010);
#else
		static const btMatrix3x3
			identityMatrix(
				btScalar(1.0), btScalar(0.0), btScalar(0.0),
				btScalar(0.0), btScalar(1.0), btScalar(0.0),
				btScalar(0.0), btScalar(0.0), btScalar(1.0));
#endif
		return identityMatrix;
	}

	/**@brief Fill the rotational part of an OpenGL matrix and clear the shear/perspective
	*  @param m The array to be filled (at least 12 scalars; written column-major with zeroed 4th entries) */
	void getOpenGLSubMatrix(btScalar * m) const
	{
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
		// transpose the three rows into three columns with zero w lanes
		__m128 v0 = m_el[0].mVec128;
		__m128 v1 = m_el[1].mVec128;
		__m128 v2 = m_el[2].mVec128;  //  x2 y2 z2 w2
		__m128* vm = (__m128*)m;
		__m128 vT;

		v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0

		vT = _mm_unpackhi_ps(v0, v1);  //	z0 z1 * *
		v0 = _mm_unpacklo_ps(v0, v1);  //	x0 x1 y0 y1

		v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));                    // y0 y1 y2 0
		v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));                    // x0 x1 x2 0
		v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // z0 z1 z2 0

		vm[0] = v0;
		vm[1] = v1;
		vm[2] = v2;
#elif defined(BT_USE_NEON)
		// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
		static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0};
		float32x4_t* vm = (float32x4_t*)m;
		float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
		float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2  0 }, {y2 0}
		float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
		float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
		float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
		float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);  // z0 z1 z2  0
		vm[0] = v0;
		vm[1] = v1;
		vm[2] = v2;
#else
		m[0] = btScalar(m_el[0].x());
		m[1] = btScalar(m_el[1].x());
		m[2] = btScalar(m_el[2].x());
		m[3] = btScalar(0.0);
		m[4] = btScalar(m_el[0].y());
		m[5] = btScalar(m_el[1].y());
		m[6] = btScalar(m_el[2].y());
		m[7] = btScalar(0.0);
		m[8] = btScalar(m_el[0].z());
		m[9] = btScalar(m_el[1].z());
		m[10] = btScalar(m_el[2].z());
		m[11] = btScalar(0.0);
#endif
	}

	/**@brief Get the matrix represented as a quaternion
	*  @param q The quaternion which will be set
	*  Standard trace-based extraction: when the trace is positive the w component
	*  dominates; otherwise the largest diagonal element selects the dominant axis
	*  to keep the square root well conditioned. */
	void getRotation(btQuaternion & q) const
	{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
		btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
		btScalar s, x;

		// union lets the four scalars be handed to the SIMD quaternion in one store
		union {
			btSimdFloat4 vec;
			btScalar f[4];
		} temp;

		if (trace > btScalar(0.0))
		{
			x = trace + btScalar(1.0);

			temp.f[0] = m_el[2].y() - m_el[1].z();
			temp.f[1] = m_el[0].z() - m_el[2].x();
			temp.f[2] = m_el[1].x() - m_el[0].y();
			temp.f[3] = x;
			//temp.f[3]= s * btScalar(0.5);
		}
		else
		{
			// pick i as the index of the largest diagonal element; j, k follow cyclically
			int i, j, k;
			if (m_el[0].x() < m_el[1].y())
			{
				if (m_el[1].y() < m_el[2].z())
				{
					i = 2;
					j = 0;
					k = 1;
				}
				else
				{
					i = 1;
					j = 2;
					k = 0;
				}
			}
			else
			{
				if (m_el[0].x() < m_el[2].z())
				{
					i = 2;
					j = 0;
					k = 1;
				}
				else
				{
					i = 0;
					j = 1;
					k = 2;
				}
			}

			x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);

			temp.f[3] = (m_el[k][j] - m_el[j][k]);
			temp.f[j] = (m_el[j][i] + m_el[i][j]);
			temp.f[k] = (m_el[k][i] + m_el[i][k]);
			temp.f[i] = x;
			//temp.f[i] = s * btScalar(0.5);
		}

		s = btSqrt(x);
		q.set128(temp.vec);
		s = btScalar(0.5) / s;

		q *= s;
#else
		btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();

		btScalar temp[4];

		if (trace > btScalar(0.0))
		{
			btScalar s = btSqrt(trace + btScalar(1.0));
			temp[3] = (s * btScalar(0.5));
			s = btScalar(0.5) / s;

			temp[0] = ((m_el[2].y() - m_el[1].z()) * s);
			temp[1] = ((m_el[0].z() - m_el[2].x()) * s);
			temp[2] = ((m_el[1].x() - m_el[0].y()) * s);
		}
		else
		{
			// i = index of largest diagonal element, j/k cyclic successors
			int i = m_el[0].x() < m_el[1].y() ? (m_el[1].y() < m_el[2].z() ? 2 : 1) : (m_el[0].x() < m_el[2].z() ? 2 : 0);
			int j = (i + 1) % 3;
			int k = (i + 2) % 3;

			btScalar s = btSqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0));
			temp[i] = s * btScalar(0.5);
			s = btScalar(0.5) / s;

			temp[3] = (m_el[k][j] - m_el[j][k]) * s;
			temp[j] = (m_el[j][i] + m_el[i][j]) * s;
			temp[k] = (m_el[k][i] + m_el[i][k]) * s;
		}
		q.setValue(temp[0], temp[1], temp[2], temp[3]);
#endif
	}

	/**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR
	*  @param yaw Yaw around Y axis
	*  @param pitch Pitch around X axis
	*  @param roll around Z axis */
	void getEulerYPR(btScalar & yaw, btScalar & pitch, btScalar & roll) const
	{
		// first use the normal calculus
		yaw = btScalar(btAtan2(m_el[1].x(), m_el[0].x()));
		pitch = btScalar(btAsin(-m_el[2].x()));
		roll = btScalar(btAtan2(m_el[2].y(), m_el[2].z()));

		// on pitch = +/-HalfPI
		if (btFabs(pitch) == SIMD_HALF_PI)
		{
			if (yaw > 0)
				yaw -= SIMD_PI;
			else
				yaw += SIMD_PI;

			if (roll > 0)
				roll -= SIMD_PI;
			else
				roll += SIMD_PI;
		}
	};

	/**@brief Get the matrix represented as euler angles around ZYX
	*  @param yaw Yaw around Z axis
	*  @param pitch Pitch around Y axis
	*  @param roll around X axis
	*  @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/
	void getEulerZYX(btScalar & yaw, btScalar & pitch, btScalar & roll, unsigned int solution_number = 1) const
	{
		struct Euler
		{
			btScalar yaw;
			btScalar pitch;
			btScalar roll;
		};

		Euler euler_out;
		Euler euler_out2;  //second solution
		//get the pointer to the raw data

		// Check that pitch is not at a singularity
		if (btFabs(m_el[2].x()) >= 1)
		{
			euler_out.yaw = 0;
			euler_out2.yaw = 0;

			// From difference of angles formula
			btScalar delta = btAtan2(m_el[0].x(), m_el[0].z());
			if (m_el[2].x() > 0)  //gimbal locked up
			{
				euler_out.pitch = SIMD_PI / btScalar(2.0);
				euler_out2.pitch = SIMD_PI / btScalar(2.0);
				euler_out.roll = euler_out.pitch + delta;
				euler_out2.roll = euler_out.pitch + delta;
			}
			else  // gimbal locked down
			{
				euler_out.pitch = -SIMD_PI / btScalar(2.0);
				euler_out2.pitch = -SIMD_PI / btScalar(2.0);
				euler_out.roll = -euler_out.pitch + delta;
				euler_out2.roll = -euler_out.pitch + delta;
			}
		}
		else
		{
			euler_out.pitch = -btAsin(m_el[2].x());
			euler_out2.pitch = SIMD_PI - euler_out.pitch;

			euler_out.roll = btAtan2(m_el[2].y() / btCos(euler_out.pitch),
									 m_el[2].z() / btCos(euler_out.pitch));
			euler_out2.roll = btAtan2(m_el[2].y() / btCos(euler_out2.pitch),
									  m_el[2].z() / btCos(euler_out2.pitch));

			euler_out.yaw = btAtan2(m_el[1].x() / btCos(euler_out.pitch),
									m_el[0].x() / btCos(euler_out.pitch));
			euler_out2.yaw = btAtan2(m_el[1].x() / btCos(euler_out2.pitch),
									 m_el[0].x() / btCos(euler_out2.pitch));
		}

		if (solution_number == 1)
		{
			yaw = euler_out.yaw;
			pitch = euler_out.pitch;
			roll = euler_out.roll;
		}
		else
		{
			yaw = euler_out2.yaw;
			pitch = euler_out2.pitch;
			roll = euler_out2.roll;
		}
	}

	/**@brief Create a scaled copy of the matrix
	*  @param s Scaling vector The elements of the vector will scale each column */

	btMatrix3x3 scaled(const btVector3& s) const
	{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
		return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
#else
		return btMatrix3x3(
			m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
			m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
			m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());
#endif
	}

	/**@brief Return the determinant of the matrix */
	btScalar determinant() const;
	/**@brief Return the adjoint of the matrix */
	btMatrix3x3 adjoint() const;
	/**@brief Return the matrix with all values non negative */
	btMatrix3x3 absolute() const;
	/**@brief Return the transpose of the matrix */
	btMatrix3x3 transpose() const;
	/**@brief Return the inverse of the matrix */
	btMatrix3x3 inverse() const;

	/// Solve A * x = b, where b is a column vector. This is more efficient
	/// than computing the inverse in one-shot cases.
	///Solve33 is from Box2d, thanks to Erin Catto,
	btVector3 solve33(const btVector3& b) const
	{
		btVector3 col1 = getColumn(0);
		btVector3 col2 = getColumn(1);
		btVector3 col3 = getColumn(2);

		// Cramer's rule; det left unreciprocated (and result unscaled) when the
		// matrix is near singular (|det| <= SIMD_EPSILON).
		btScalar det = btDot(col1, btCross(col2, col3));
		if (btFabs(det) > SIMD_EPSILON)
		{
			det = 1.0f / det;
		}
		btVector3 x;
		x[0] = det * btDot(b, btCross(col2, col3));
		x[1] = det * btDot(col1, btCross(b, col3));
		x[2] = det * btDot(col1, btCross(col2, b));
		return x;
	}

	btMatrix3x3 transposeTimes(const btMatrix3x3& m) const;
	btMatrix3x3 timesTranspose(const btMatrix3x3& m) const;

	/// Dot of v with column 0 (i.e. row 0 of the transpose)
	SIMD_FORCE_INLINE btScalar tdotx(const btVector3& v) const
	{
		return m_el[0].x() * v.x() + m_el[1].x() * v.y() + m_el[2].x() * v.z();
	}
	/// Dot of v with column 1
	SIMD_FORCE_INLINE btScalar tdoty(const btVector3& v) const
	{
		return m_el[0].y() * v.x() + m_el[1].y() * v.y() + m_el[2].y() * v.z();
	}
	/// Dot of v with column 2
	SIMD_FORCE_INLINE btScalar tdotz(const btVector3& v) const
	{
		return m_el[0].z() * v.x() + m_el[1].z() * v.y() + m_el[2].z() * v.z();
	}

	///extractRotation is from "A robust method to extract the rotational part of deformations"
	///See http://dl.acm.org/citation.cfm?doid=2994258.2994269
	///decomposes a matrix A in a orthogonal matrix R and a
	///symmetric matrix S:
	///A = R*S.
	///note that R can include both rotation and scaling.
	SIMD_FORCE_INLINE void extractRotation(btQuaternion & q, btScalar tolerance = 1.0e-9, int maxIter = 100)
	{
		int iter = 0;
		btScalar w;
		const btMatrix3x3& A = *this;
		for (iter = 0; iter < maxIter; iter++)
		{
			btMatrix3x3 R(q);
			// omega is the axis-angle correction step; tolerance in the denominator
			// guards against division by zero.
			btVector3 omega = (R.getColumn(0).cross(A.getColumn(0)) + R.getColumn(1).cross(A.getColumn(1)) + R.getColumn(2).cross(A.getColumn(2))) * (btScalar(1.0) / btFabs(R.getColumn(0).dot(A.getColumn(0)) + R.getColumn(1).dot(A.getColumn(1)) + R.getColumn(2).dot(A.getColumn(2))) +
																																					  tolerance);
			w = omega.norm();
			if (w < tolerance)
				break;
			q = btQuaternion(btVector3((btScalar(1.0) / w) * omega), w) *
				q;
			q.normalize();
		}
	}

	/**@brief diagonalizes this matrix by the Jacobi method.
	*  @param rot stores the rotation from the coordinate system in which the matrix is diagonal to the original
	*  coordinate system, i.e., old_this = rot * new_this * rot^T.
	*  @param threshold See iteration
	*  @param maxSteps The iteration stops when all off-diagonal elements are less than the threshold multiplied
	*  by the sum of the absolute values of the diagonal, or when maxSteps have been executed.
	*
	*  Note that this matrix is assumed to be symmetric.
	*/
	void diagonalize(btMatrix3x3 & rot, btScalar threshold, int maxSteps)
	{
		rot.setIdentity();
		for (int step = maxSteps; step > 0; step--)
		{
			// find off-diagonal element [p][q] with largest magnitude
			int p = 0;
			int q = 1;
			int r = 2;
			btScalar max = btFabs(m_el[0][1]);
			btScalar v = btFabs(m_el[0][2]);
			if (v > max)
			{
				q = 2;
				r = 1;
				max = v;
			}
			v = btFabs(m_el[1][2]);
			if (v > max)
			{
				p = 1;
				q = 2;
				r = 0;
				max = v;
			}

			btScalar t = threshold * (btFabs(m_el[0][0]) + btFabs(m_el[1][1]) + btFabs(m_el[2][2]));
			if (max <= t)
			{
				if (max <= SIMD_EPSILON * t)
				{
					return;
				}
				// close enough: do one last sweep then stop
				step = 1;
			}

			// compute Jacobi rotation J which leads to a zero for element [p][q]
			btScalar mpq = m_el[p][q];
			btScalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
			btScalar theta2 = theta * theta;
			btScalar cos;
			btScalar sin;
			if (theta2 * theta2 < btScalar(10 / SIMD_EPSILON))
			{
				t = (theta >= 0) ? 1 / (theta + btSqrt(1 + theta2))
								 : 1 / (theta - btSqrt(1 + theta2));
				cos = 1 / btSqrt(1 + t * t);
				sin = cos * t;
			}
			else
			{
				// approximation for large theta-value, i.e., a nearly diagonal matrix
				t = 1 / (theta * (2 + btScalar(0.5) / theta2));
				cos = 1 - btScalar(0.5) * t * t;
				sin = cos * t;
			}

			// apply rotation to matrix (this = J^T * this * J)
			m_el[p][q] = m_el[q][p] = 0;
			m_el[p][p] -= t * mpq;
			m_el[q][q] += t * mpq;
			btScalar mrp = m_el[r][p];
			btScalar mrq = m_el[r][q];
			m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
			m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;

			// apply rotation to rot (rot = rot * J)
			for (int i = 0; i < 3; i++)
			{
				btVector3& row = rot[i];
				mrp = row[p];
				mrq = row[q];
				row[p] = cos * mrp - sin * mrq;
				row[q] = cos * mrq + sin * mrp;
			}
		}
	}

	/**@brief Calculate the matrix cofactor
	*  @param r1 The first row to use for calculating the cofactor
	*  @param c1 The first column to use for calculating the cofactor
	*  @param r2 The second row to use for calculating the cofactor
	*  @param c2 The second column to use for calculating the cofactor
	*  See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details
	*/
	btScalar cofac(int r1, int c1, int r2, int c2) const
	{
		return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
	}

	void serialize(struct btMatrix3x3Data & dataOut) const;

	void serializeFloat(struct btMatrix3x3FloatData & dataOut) const;

	void deSerialize(const struct btMatrix3x3Data& dataIn);

	void deSerializeFloat(const struct btMatrix3x3FloatData& dataIn);

	void deSerializeDouble(const struct btMatrix3x3DoubleData& dataIn);
};
793
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator*=(const btMatrix3x3& m)
{
	// In-place right-multiply: this = this * m. Each result row i is the linear
	// combination this[i][0]*m.row0 + this[i][1]*m.row1 + this[i][2]*m.row2.
#if defined BT_USE_SIMD_VECTOR3 && defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)
	__m128 rv00, rv01, rv02;
	__m128 rv10, rv11, rv12;
	__m128 rv20, rv21, rv22;
	__m128 mv0, mv1, mv2;

	rv02 = m_el[0].mVec128;
	rv12 = m_el[1].mVec128;
	rv22 = m_el[2].mVec128;

	// mask out w lanes of m's rows so they don't pollute the products
	mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask);
	mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask);
	mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask);

	// rv0: broadcast each scalar of row 0, multiply against m's rows
	rv00 = bt_splat_ps(rv02, 0);
	rv01 = bt_splat_ps(rv02, 1);
	rv02 = bt_splat_ps(rv02, 2);

	rv00 = _mm_mul_ps(rv00, mv0);
	rv01 = _mm_mul_ps(rv01, mv1);
	rv02 = _mm_mul_ps(rv02, mv2);

	// rv1
	rv10 = bt_splat_ps(rv12, 0);
	rv11 = bt_splat_ps(rv12, 1);
	rv12 = bt_splat_ps(rv12, 2);

	rv10 = _mm_mul_ps(rv10, mv0);
	rv11 = _mm_mul_ps(rv11, mv1);
	rv12 = _mm_mul_ps(rv12, mv2);

	// rv2
	rv20 = bt_splat_ps(rv22, 0);
	rv21 = bt_splat_ps(rv22, 1);
	rv22 = bt_splat_ps(rv22, 2);

	rv20 = _mm_mul_ps(rv20, mv0);
	rv21 = _mm_mul_ps(rv21, mv1);
	rv22 = _mm_mul_ps(rv22, mv2);

	rv00 = _mm_add_ps(rv00, rv01);
	rv10 = _mm_add_ps(rv10, rv11);
	rv20 = _mm_add_ps(rv20, rv21);

	m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
	m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
	m_el[2].mVec128 = _mm_add_ps(rv20, rv22);

#elif defined(BT_USE_NEON)

	float32x4_t rv0, rv1, rv2;
	float32x4_t v0, v1, v2;
	float32x4_t mv0, mv1, mv2;

	v0 = m_el[0].mVec128;
	v1 = m_el[1].mVec128;
	v2 = m_el[2].mVec128;

	// mask out w lanes of m's rows
	mv0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
	mv1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
	mv2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

	// multiply-accumulate by broadcast lanes: rv_i = sum_j v_i[j] * m.row_j
	rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
	rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
	rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

	rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
	rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
	rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

	rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
	rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
	rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

	m_el[0].mVec128 = rv0;
	m_el[1].mVec128 = rv1;
	m_el[2].mVec128 = rv2;
#else
	// scalar path: element [i][j] = dot(this row i, m column j) via tdot helpers
	setValue(
		m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
		m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
		m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
#endif
	return *this;
}
883
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator+=(const btMatrix3x3& m)
{
	// In-place element-wise addition: this = this + m.
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
	m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
	m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
	m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
#else
	setValue(
		m_el[0][0] + m.m_el[0][0],
		m_el[0][1] + m.m_el[0][1],
		m_el[0][2] + m.m_el[0][2],
		m_el[1][0] + m.m_el[1][0],
		m_el[1][1] + m.m_el[1][1],
		m_el[1][2] + m.m_el[1][2],
		m_el[2][0] + m.m_el[2][0],
		m_el[2][1] + m.m_el[2][1],
		m_el[2][2] + m.m_el[2][2]);
#endif
	return *this;
}
905
/// Scale every element of matrix m by scalar k; returns a new matrix.
SIMD_FORCE_INLINE btMatrix3x3
operator*(const btMatrix3x3& m, const btScalar& k)
{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
	// broadcast k into all four lanes, then one multiply per row
	__m128 vk = bt_splat_ps(_mm_load_ss((float*)&k), 0x80);
	return btMatrix3x3(
		_mm_mul_ps(m[0].mVec128, vk),
		_mm_mul_ps(m[1].mVec128, vk),
		_mm_mul_ps(m[2].mVec128, vk));
#elif defined(BT_USE_NEON)
	return btMatrix3x3(
		vmulq_n_f32(m[0].mVec128, k),
		vmulq_n_f32(m[1].mVec128, k),
		vmulq_n_f32(m[2].mVec128, k));
#else
	return btMatrix3x3(
		m[0].x() * k, m[0].y() * k, m[0].z() * k,
		m[1].x() * k, m[1].y() * k, m[1].z() * k,
		m[2].x() * k, m[2].y() * k, m[2].z() * k);
#endif
}
927
/// Element-wise sum of two matrices; returns a new matrix.
SIMD_FORCE_INLINE btMatrix3x3
operator+(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
	return btMatrix3x3(
		m1[0].mVec128 + m2[0].mVec128,
		m1[1].mVec128 + m2[1].mVec128,
		m1[2].mVec128 + m2[2].mVec128);
#else
	return btMatrix3x3(
		m1[0][0] + m2[0][0],
		m1[0][1] + m2[0][1],
		m1[0][2] + m2[0][2],

		m1[1][0] + m2[1][0],
		m1[1][1] + m2[1][1],
		m1[1][2] + m2[1][2],

		m1[2][0] + m2[2][0],
		m1[2][1] + m2[2][1],
		m1[2][2] + m2[2][2]);
#endif
}
951
/// Element-wise difference of two matrices; returns a new matrix.
SIMD_FORCE_INLINE btMatrix3x3
operator-(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
	return btMatrix3x3(
		m1[0].mVec128 - m2[0].mVec128,
		m1[1].mVec128 - m2[1].mVec128,
		m1[2].mVec128 - m2[2].mVec128);
#else
	return btMatrix3x3(
		m1[0][0] - m2[0][0],
		m1[0][1] - m2[0][1],
		m1[0][2] - m2[0][2],

		m1[1][0] - m2[1][0],
		m1[1][1] - m2[1][1],
		m1[1][2] - m2[1][2],

		m1[2][0] - m2[2][0],
		m1[2][1] - m2[2][1],
		m1[2][2] - m2[2][2]);
#endif
}
975
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator-=(const btMatrix3x3& m)
{
	// In-place element-wise subtraction: this = this - m.
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
	m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
	m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
	m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
#else
	setValue(
		m_el[0][0] - m.m_el[0][0],
		m_el[0][1] - m.m_el[0][1],
		m_el[0][2] - m.m_el[0][2],
		m_el[1][0] - m.m_el[1][0],
		m_el[1][1] - m.m_el[1][1],
		m_el[1][2] - m.m_el[1][2],
		m_el[2][0] - m.m_el[2][0],
		m_el[2][1] - m.m_el[2][1],
		m_el[2][2] - m.m_el[2][2]);
#endif
	return *this;
}
997
998 SIMD_FORCE_INLINE btScalar
determinant()999 btMatrix3x3::determinant() const
1000 {
1001 return btTriple((*this)[0], (*this)[1], (*this)[2]);
1002 }
1003
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::absolute() const
{
	// Element-wise absolute value; SIMD paths clear the IEEE-754 sign bit via a mask.
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
	return btMatrix3x3(
		_mm_and_ps(m_el[0].mVec128, btvAbsfMask),
		_mm_and_ps(m_el[1].mVec128, btvAbsfMask),
		_mm_and_ps(m_el[2].mVec128, btvAbsfMask));
#elif defined(BT_USE_NEON)
	return btMatrix3x3(
		(float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
		(float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
		(float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
#else
	return btMatrix3x3(
		btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
		btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
		btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
#endif
}
1024
/**@brief Return the transpose of this matrix.
 * Note: the SIMD paths zero the (unused) w lane of each result row. */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::transpose() const
{
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
	// 3x3 transpose built from unpack/shuffle; comments track lane contents.
	__m128 v0 = m_el[0].mVec128;
	__m128 v1 = m_el[1].mVec128;
	__m128 v2 = m_el[2].mVec128;  //  x2 y2 z2 w2
	__m128 vT;

	v2 = _mm_and_ps(v2, btvFFF0fMask);  //  x2 y2 z2 0

	vT = _mm_unpackhi_ps(v0, v1);  //	z0 z1 * *
	v0 = _mm_unpacklo_ps(v0, v1);  //	x0 x1 y0 y1

	v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3));                    // y0 y1 y2 0
	v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3));                    // x0 x1 x2 0
	v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));  // z0 z1 z2 0

	return btMatrix3x3(v0, v1, v2);
#elif defined(BT_USE_NEON)
	// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
	static const uint32x2_t zMask = (const uint32x2_t){static_cast<uint32_t>(-1), 0};
	float32x4x2_t top = vtrnq_f32(m_el[0].mVec128, m_el[1].mVec128);               // {x0 x1 z0 z1}, {y0 y1 w0 w1}
	float32x2x2_t bl = vtrn_f32(vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f));  // {x2 0 }, {y2 0}
	float32x4_t v0 = vcombine_f32(vget_low_f32(top.val[0]), bl.val[0]);
	float32x4_t v1 = vcombine_f32(vget_low_f32(top.val[1]), bl.val[1]);
	float32x2_t q = (float32x2_t)vand_u32((uint32x2_t)vget_high_f32(m_el[2].mVec128), zMask);
	float32x4_t v2 = vcombine_f32(vget_high_f32(top.val[0]), q);  // z0 z1 z2  0
	return btMatrix3x3(v0, v1, v2);
#else
	// Scalar fallback: build the result by reading columns as rows.
	return btMatrix3x3(m_el[0].x(), m_el[1].x(), m_el[2].x(),
					   m_el[0].y(), m_el[1].y(), m_el[2].y(),
					   m_el[0].z(), m_el[1].z(), m_el[2].z());
#endif
}
1060
1061 SIMD_FORCE_INLINE btMatrix3x3
adjoint()1062 btMatrix3x3::adjoint() const
1063 {
1064 return btMatrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
1065 cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
1066 cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
1067 }
1068
1069 SIMD_FORCE_INLINE btMatrix3x3
inverse()1070 btMatrix3x3::inverse() const
1071 {
1072 btVector3 co(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
1073 btScalar det = (*this)[0].dot(co);
1074 //btFullAssert(det != btScalar(0.0));
1075 btAssert(det != btScalar(0.0));
1076 btScalar s = btScalar(1.0) / det;
1077 return btMatrix3x3(co.x() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s,
1078 co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
1079 co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
1080 }
1081
/**@brief Return transpose(*this) * m. */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
{
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
	// zeros w
	//    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
	// Accumulate r_i += m_row * broadcast(this_row lane i); shuffle imm
	// 0x00/0x55/0xaa splats lane 0/1/2 of `row` across all four lanes.
	__m128 row = m_el[0].mVec128;
	__m128 m0 = _mm_and_ps(m.getRow(0).mVec128, btvFFF0fMask);
	__m128 m1 = _mm_and_ps(m.getRow(1).mVec128, btvFFF0fMask);
	__m128 m2 = _mm_and_ps(m.getRow(2).mVec128, btvFFF0fMask);
	__m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
	__m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
	__m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
	row = m_el[1].mVec128;
	r0 = _mm_add_ps(r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
	r1 = _mm_add_ps(r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
	r2 = _mm_add_ps(r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
	row = m_el[2].mVec128;
	r0 = _mm_add_ps(r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
	r1 = _mm_add_ps(r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
	r2 = _mm_add_ps(r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
	return btMatrix3x3(r0, r1, r2);

#elif defined BT_USE_NEON
	// zeros w
	// Same accumulation scheme as the SSE path, using multiply-accumulate
	// against a broadcast lane of each row of *this.
	static const uint32x4_t xyzMask = (const uint32x4_t){static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0};
	float32x4_t m0 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(0).mVec128, xyzMask);
	float32x4_t m1 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(1).mVec128, xyzMask);
	float32x4_t m2 = (float32x4_t)vandq_u32((uint32x4_t)m.getRow(2).mVec128, xyzMask);
	float32x4_t row = m_el[0].mVec128;
	float32x4_t r0 = vmulq_lane_f32(m0, vget_low_f32(row), 0);
	float32x4_t r1 = vmulq_lane_f32(m0, vget_low_f32(row), 1);
	float32x4_t r2 = vmulq_lane_f32(m0, vget_high_f32(row), 0);
	row = m_el[1].mVec128;
	r0 = vmlaq_lane_f32(r0, m1, vget_low_f32(row), 0);
	r1 = vmlaq_lane_f32(r1, m1, vget_low_f32(row), 1);
	r2 = vmlaq_lane_f32(r2, m1, vget_high_f32(row), 0);
	row = m_el[2].mVec128;
	r0 = vmlaq_lane_f32(r0, m2, vget_low_f32(row), 0);
	r1 = vmlaq_lane_f32(r1, m2, vget_low_f32(row), 1);
	r2 = vmlaq_lane_f32(r2, m2, vget_high_f32(row), 0);
	return btMatrix3x3(r0, r1, r2);
#else
	// Scalar fallback: result(i,j) = sum_k this(k,i) * m(k,j).
	return btMatrix3x3(
		m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
		m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
		m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
		m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
		m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
		m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
		m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
		m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
		m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
#endif
}
1137
/**@brief Return (*this) * transpose(m). */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::timesTranspose(const btMatrix3x3& m) const
{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))
	__m128 a0 = m_el[0].mVec128;
	__m128 a1 = m_el[1].mVec128;
	__m128 a2 = m_el[2].mVec128;

	btMatrix3x3 mT = m.transpose();  // we rely on transpose() zeroing w channel so that we don't have to do it here
	__m128 mx = mT[0].mVec128;
	__m128 my = mT[1].mVec128;
	__m128 mz = mT[2].mVec128;

	// r_i = sum over columns of mT scaled by the broadcast lanes of a_i
	// (shuffle imm 0x00/0x55/0xaa splats lane 0/1/2).
	__m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
	__m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
	__m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
	r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
	r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
	r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
	r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
	r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
	r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
	return btMatrix3x3(r0, r1, r2);

#elif defined BT_USE_NEON
	float32x4_t a0 = m_el[0].mVec128;
	float32x4_t a1 = m_el[1].mVec128;
	float32x4_t a2 = m_el[2].mVec128;

	btMatrix3x3 mT = m.transpose();  // we rely on transpose() zeroing w channel so that we don't have to do it here
	float32x4_t mx = mT[0].mVec128;
	float32x4_t my = mT[1].mVec128;
	float32x4_t mz = mT[2].mVec128;

	// Same scheme as SSE, using lane-broadcast multiply-accumulate.
	float32x4_t r0 = vmulq_lane_f32(mx, vget_low_f32(a0), 0);
	float32x4_t r1 = vmulq_lane_f32(mx, vget_low_f32(a1), 0);
	float32x4_t r2 = vmulq_lane_f32(mx, vget_low_f32(a2), 0);
	r0 = vmlaq_lane_f32(r0, my, vget_low_f32(a0), 1);
	r1 = vmlaq_lane_f32(r1, my, vget_low_f32(a1), 1);
	r2 = vmlaq_lane_f32(r2, my, vget_low_f32(a2), 1);
	r0 = vmlaq_lane_f32(r0, mz, vget_high_f32(a0), 0);
	r1 = vmlaq_lane_f32(r1, mz, vget_high_f32(a1), 0);
	r2 = vmlaq_lane_f32(r2, mz, vget_high_f32(a2), 0);
	return btMatrix3x3(r0, r1, r2);

#else
	// Scalar fallback: result(i,j) = row_i(this) . row_j(m).
	return btMatrix3x3(
		m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
		m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
		m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
#endif
}
1190
1191 SIMD_FORCE_INLINE btVector3
1192 operator*(const btMatrix3x3& m, const btVector3& v)
1193 {
1194 #if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE)) || defined(BT_USE_NEON)
1195 return v.dot3(m[0], m[1], m[2]);
1196 #else
1197 return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
1198 #endif
1199 }
1200
/**@brief Multiply row-vector v by matrix m (equivalent to transpose(m) * v):
 * each result component is the dot of v with a column of m. */
SIMD_FORCE_INLINE btVector3
operator*(const btVector3& v, const btMatrix3x3& m)
{
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))

	const __m128 vv = v.mVec128;

	// Broadcast each component of v, then scale the matching (w-masked) row.
	__m128 c0 = bt_splat_ps(vv, 0);
	__m128 c1 = bt_splat_ps(vv, 1);
	__m128 c2 = bt_splat_ps(vv, 2);

	c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask));
	c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask));
	c0 = _mm_add_ps(c0, c1);
	c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask));

	return btVector3(_mm_add_ps(c0, c2));
#elif defined(BT_USE_NEON)
	const float32x4_t vv = v.mVec128;
	const float32x2_t vlo = vget_low_f32(vv);
	const float32x2_t vhi = vget_high_f32(vv);

	float32x4_t c0, c1, c2;

	// Mask off the w lane of each row, then scale by the matching lane of v.
	c0 = (float32x4_t)vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
	c1 = (float32x4_t)vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
	c2 = (float32x4_t)vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

	c0 = vmulq_lane_f32(c0, vlo, 0);
	c1 = vmulq_lane_f32(c1, vlo, 1);
	c2 = vmulq_lane_f32(c2, vhi, 0);
	c0 = vaddq_f32(c0, c1);
	c0 = vaddq_f32(c0, c2);

	return btVector3(c0);
#else
	// Scalar fallback: tdotx/tdoty/tdotz dot v against the columns of m.
	return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
#endif
}
1240
/**@brief Return the matrix product m1 * m2. */
SIMD_FORCE_INLINE btMatrix3x3
operator*(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if defined BT_USE_SIMD_VECTOR3 && (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))

	// Row i of the result accumulates each (w-masked) row of m2 scaled by
	// the broadcast lanes 0/1/2 of row i of m1.
	__m128 m10 = m1[0].mVec128;
	__m128 m11 = m1[1].mVec128;
	__m128 m12 = m1[2].mVec128;

	__m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);

	__m128 c0 = bt_splat_ps(m10, 0);
	__m128 c1 = bt_splat_ps(m11, 0);
	__m128 c2 = bt_splat_ps(m12, 0);

	c0 = _mm_mul_ps(c0, m2v);
	c1 = _mm_mul_ps(c1, m2v);
	c2 = _mm_mul_ps(c2, m2v);

	m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);

	__m128 c0_1 = bt_splat_ps(m10, 1);
	__m128 c1_1 = bt_splat_ps(m11, 1);
	__m128 c2_1 = bt_splat_ps(m12, 1);

	c0_1 = _mm_mul_ps(c0_1, m2v);
	c1_1 = _mm_mul_ps(c1_1, m2v);
	c2_1 = _mm_mul_ps(c2_1, m2v);

	m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);

	c0 = _mm_add_ps(c0, c0_1);
	c1 = _mm_add_ps(c1, c1_1);
	c2 = _mm_add_ps(c2, c2_1);

	// The m1 row registers are no longer needed as-is; reuse them for the
	// final lane-2 terms.
	m10 = bt_splat_ps(m10, 2);
	m11 = bt_splat_ps(m11, 2);
	m12 = bt_splat_ps(m12, 2);

	m10 = _mm_mul_ps(m10, m2v);
	m11 = _mm_mul_ps(m11, m2v);
	m12 = _mm_mul_ps(m12, m2v);

	c0 = _mm_add_ps(c0, m10);
	c1 = _mm_add_ps(c1, m11);
	c2 = _mm_add_ps(c2, m12);

	return btMatrix3x3(c0, c1, c2);

#elif defined(BT_USE_NEON)

	// Same accumulation scheme as the SSE path, via lane-broadcast
	// multiply-accumulate against the w-masked rows of m2.
	float32x4_t rv0, rv1, rv2;
	float32x4_t v0, v1, v2;
	float32x4_t mv0, mv1, mv2;

	v0 = m1[0].mVec128;
	v1 = m1[1].mVec128;
	v2 = m1[2].mVec128;

	mv0 = (float32x4_t)vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask);
	mv1 = (float32x4_t)vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask);
	mv2 = (float32x4_t)vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask);

	rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
	rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
	rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

	rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
	rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
	rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

	rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
	rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
	rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

	return btMatrix3x3(rv0, rv1, rv2);

#else
	// Scalar fallback: result(i,j) = row_i(m1) . column_j(m2), via the
	// transposed-dot helpers tdotx/tdoty/tdotz.
	return btMatrix3x3(
		m2.tdotx(m1[0]), m2.tdoty(m1[0]), m2.tdotz(m1[0]),
		m2.tdotx(m1[1]), m2.tdoty(m1[1]), m2.tdotz(m1[1]),
		m2.tdotx(m1[2]), m2.tdoty(m1[2]), m2.tdotz(m1[2]));
#endif
}
1325
1326 /*
1327 SIMD_FORCE_INLINE btMatrix3x3 btMultTransposeLeft(const btMatrix3x3& m1, const btMatrix3x3& m2) {
1328 return btMatrix3x3(
1329 m1[0][0] * m2[0][0] + m1[1][0] * m2[1][0] + m1[2][0] * m2[2][0],
1330 m1[0][0] * m2[0][1] + m1[1][0] * m2[1][1] + m1[2][0] * m2[2][1],
1331 m1[0][0] * m2[0][2] + m1[1][0] * m2[1][2] + m1[2][0] * m2[2][2],
1332 m1[0][1] * m2[0][0] + m1[1][1] * m2[1][0] + m1[2][1] * m2[2][0],
1333 m1[0][1] * m2[0][1] + m1[1][1] * m2[1][1] + m1[2][1] * m2[2][1],
1334 m1[0][1] * m2[0][2] + m1[1][1] * m2[1][2] + m1[2][1] * m2[2][2],
1335 m1[0][2] * m2[0][0] + m1[1][2] * m2[1][0] + m1[2][2] * m2[2][0],
1336 m1[0][2] * m2[0][1] + m1[1][2] * m2[1][1] + m1[2][2] * m2[2][1],
1337 m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
1338 }
1339 */
1340
/**@brief Equality operator between two matrices
* It will test all elements are equal. */
SIMD_FORCE_INLINE bool operator==(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if (defined(BT_USE_SSE_IN_API) && defined(BT_USE_SSE))

	__m128 c0, c1, c2;

	// Compare all three rows lane-wise, then AND the masks together.
	c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
	c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
	c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);

	c0 = _mm_and_ps(c0, c1);
	c0 = _mm_and_ps(c0, c2);

	// Only the low three bits (x, y, z lanes) are tested; the w lane is
	// deliberately ignored.
	int m = _mm_movemask_ps((__m128)c0);
	return (0x7 == (m & 0x7));

#else
	// Scalar fallback: compare all nine elements.
	return (m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
			m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
			m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2]);
#endif
}
1365
///for serialization
struct btMatrix3x3FloatData
{
	btVector3FloatData m_el[3];  // the three matrix rows, single precision
};
1371
///for serialization
struct btMatrix3x3DoubleData
{
	btVector3DoubleData m_el[3];  // the three matrix rows, double precision
};
1377
serialize(struct btMatrix3x3Data & dataOut)1378 SIMD_FORCE_INLINE void btMatrix3x3::serialize(struct btMatrix3x3Data& dataOut) const
1379 {
1380 for (int i = 0; i < 3; i++)
1381 m_el[i].serialize(dataOut.m_el[i]);
1382 }
1383
serializeFloat(struct btMatrix3x3FloatData & dataOut)1384 SIMD_FORCE_INLINE void btMatrix3x3::serializeFloat(struct btMatrix3x3FloatData& dataOut) const
1385 {
1386 for (int i = 0; i < 3; i++)
1387 m_el[i].serializeFloat(dataOut.m_el[i]);
1388 }
1389
deSerialize(const struct btMatrix3x3Data & dataIn)1390 SIMD_FORCE_INLINE void btMatrix3x3::deSerialize(const struct btMatrix3x3Data& dataIn)
1391 {
1392 for (int i = 0; i < 3; i++)
1393 m_el[i].deSerialize(dataIn.m_el[i]);
1394 }
1395
deSerializeFloat(const struct btMatrix3x3FloatData & dataIn)1396 SIMD_FORCE_INLINE void btMatrix3x3::deSerializeFloat(const struct btMatrix3x3FloatData& dataIn)
1397 {
1398 for (int i = 0; i < 3; i++)
1399 m_el[i].deSerializeFloat(dataIn.m_el[i]);
1400 }
1401
deSerializeDouble(const struct btMatrix3x3DoubleData & dataIn)1402 SIMD_FORCE_INLINE void btMatrix3x3::deSerializeDouble(const struct btMatrix3x3DoubleData& dataIn)
1403 {
1404 for (int i = 0; i < 3; i++)
1405 m_el[i].deSerializeDouble(dataIn.m_el[i]);
1406 }
1407
1408 #endif //BT_MATRIX3x3_H
1409