1 /*
2 Copyright (c) 2003-2006 Gino van den Bergen / Erwin Coumans http://continuousphysics.com/Bullet/
3
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
9
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
13 */
14
15
16 #ifndef BT_MATRIX3x3_H
17 #define BT_MATRIX3x3_H
18
19 #include "btVector3.h"
20 #include "btQuaternion.h"
21 #include <stdio.h>
22
23 #ifdef BT_USE_SSE
24 //const __m128 ATTRIBUTE_ALIGNED16(v2220) = {2.0f, 2.0f, 2.0f, 0.0f};
25 //const __m128 ATTRIBUTE_ALIGNED16(vMPPP) = {-0.0f, +0.0f, +0.0f, +0.0f};
26 #define vMPPP (_mm_set_ps (+0.0f, +0.0f, +0.0f, -0.0f))
27 #endif
28
29 #if defined(BT_USE_SSE)
30 #define v1000 (_mm_set_ps(0.0f,0.0f,0.0f,1.0f))
31 #define v0100 (_mm_set_ps(0.0f,0.0f,1.0f,0.0f))
32 #define v0010 (_mm_set_ps(0.0f,1.0f,0.0f,0.0f))
33 #elif defined(BT_USE_NEON)
34 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v1000) = {1.0f, 0.0f, 0.0f, 0.0f};
35 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0100) = {0.0f, 1.0f, 0.0f, 0.0f};
36 const btSimdFloat4 ATTRIBUTE_ALIGNED16(v0010) = {0.0f, 0.0f, 1.0f, 0.0f};
37 #endif
38
39 #ifdef BT_USE_DOUBLE_PRECISION
40 #define btMatrix3x3Data btMatrix3x3DoubleData
41 #else
42 #define btMatrix3x3Data btMatrix3x3FloatData
43 #endif //BT_USE_DOUBLE_PRECISION
44
45
46 /**@brief The btMatrix3x3 class implements a 3x3 rotation matrix, to perform linear algebra in combination with btQuaternion, btTransform and btVector3.
47 * Make sure to only include a pure orthogonal matrix without scaling. */
ATTRIBUTE_ALIGNED16(class)48 ATTRIBUTE_ALIGNED16(class) btMatrix3x3 {
49
	///Data storage for the matrix; each btVector3 holds one ROW of the matrix
	btVector3 m_el[3];

public:
	/** @brief No initialization constructor: contents are undefined until set */
	btMatrix3x3 () {}

	//		explicit btMatrix3x3(const btScalar *m) { setFromOpenGLSubMatrix(m); }

	/**@brief Constructor from Quaternion: builds the equivalent rotation matrix */
	explicit btMatrix3x3(const btQuaternion& q) { setRotation(q); }
	/*
	template <typename btScalar>
	Matrix3x3(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
	{
		setEulerYPR(yaw, pitch, roll);
	}
	*/
	/** @brief Constructor with row major formatting (xx..xz fill the top row, etc.) */
	btMatrix3x3(const btScalar& xx, const btScalar& xy, const btScalar& xz,
		const btScalar& yx, const btScalar& yy, const btScalar& yz,
		const btScalar& zx, const btScalar& zy, const btScalar& zz)
	{
		setValue(xx, xy, xz,
			yx, yy, yz,
			zx, zy, zz);
	}

#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
	/** @brief Construct directly from three 128-bit SIMD rows (SSE/NEON builds only) */
	SIMD_FORCE_INLINE btMatrix3x3 (const btSimdFloat4 v0, const btSimdFloat4 v1, const btSimdFloat4 v2 )
	{
		m_el[0].mVec128 = v0;
		m_el[1].mVec128 = v1;
		m_el[2].mVec128 = v2;
	}

	/** @brief Construct from three row vectors */
	SIMD_FORCE_INLINE btMatrix3x3 (const btVector3& v0, const btVector3& v1, const btVector3& v2 )
	{
		m_el[0] = v0;
		m_el[1] = v1;
		m_el[2] = v2;
	}

	// Copy constructor: copies the raw 128-bit registers directly
	SIMD_FORCE_INLINE btMatrix3x3(const btMatrix3x3& rhs)
	{
		m_el[0].mVec128 = rhs.m_el[0].mVec128;
		m_el[1].mVec128 = rhs.m_el[1].mVec128;
		m_el[2].mVec128 = rhs.m_el[2].mVec128;
	}

	// Assignment operator: copies the raw 128-bit registers directly
	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& m)
	{
		m_el[0].mVec128 = m.m_el[0].mVec128;
		m_el[1].mVec128 = m.m_el[1].mVec128;
		m_el[2].mVec128 = m.m_el[2].mVec128;

		return *this;
	}

#else

	/** @brief Copy constructor (scalar build) */
	SIMD_FORCE_INLINE btMatrix3x3 (const btMatrix3x3& other)
	{
		m_el[0] = other.m_el[0];
		m_el[1] = other.m_el[1];
		m_el[2] = other.m_el[2];
	}

	/** @brief Assignment operator (scalar build) */
	SIMD_FORCE_INLINE btMatrix3x3& operator=(const btMatrix3x3& other)
	{
		m_el[0] = other.m_el[0];
		m_el[1] = other.m_el[1];
		m_el[2] = other.m_el[2];
		return *this;
	}

#endif
131
	/** @brief Get a column of the matrix as a vector
	*  @param i Column number 0 indexed
	*  Note: storage is row-major, so a column is gathered element by element. */
	SIMD_FORCE_INLINE btVector3 getColumn(int i) const
	{
		return btVector3(m_el[0][i],m_el[1][i],m_el[2][i]);
	}


	/** @brief Get a row of the matrix as a vector
	*  @param i Row number 0 indexed */
	SIMD_FORCE_INLINE const btVector3& getRow(int i) const
	{
		btFullAssert(0 <= i && i < 3);
		return m_el[i];
	}

	/** @brief Get a mutable reference to a row of the matrix as a vector
	*  @param i Row number 0 indexed */
	SIMD_FORCE_INLINE btVector3& operator[](int i)
	{
		btFullAssert(0 <= i && i < 3);
		return m_el[i];
	}

	/** @brief Get a const reference to a row of the matrix as a vector
	*  @param i Row number 0 indexed */
	SIMD_FORCE_INLINE const btVector3& operator[](int i) const
	{
		btFullAssert(0 <= i && i < 3);
		return m_el[i];
	}

	/** @brief Multiply by the target matrix on the right
	*  @param m Rotation matrix to be applied
	*  Equivalent to this = this * m */
	btMatrix3x3& operator*=(const btMatrix3x3& m);

	/** @brief Adds the target matrix
	*  @param m matrix to be applied
	*  Equivalent to this = this + m */
	btMatrix3x3& operator+=(const btMatrix3x3& m);

	/** @brief Subtracts the target matrix
	*  @param m matrix to be applied
	*  Equivalent to this = this - m */
	btMatrix3x3& operator-=(const btMatrix3x3& m);
178
	/** @brief Set from the rotational part of a 4x4 OpenGL (column-major) matrix
	*  @param m A pointer to the beginning of the array of scalars */
	void setFromOpenGLSubMatrix(const btScalar *m)
	{
		// OpenGL stores columns contiguously, so m[0], m[4], m[8] forms row 0.
		m_el[0].setValue(m[0],m[4],m[8]);
		m_el[1].setValue(m[1],m[5],m[9]);
		m_el[2].setValue(m[2],m[6],m[10]);

	}
	/** @brief Set the values of the matrix explicitly (row major)
	*  @param xx Top left
	*  @param xy Top Middle
	*  @param xz Top Right
	*  @param yx Middle Left
	*  @param yy Middle Middle
	*  @param yz Middle Right
	*  @param zx Bottom Left
	*  @param zy Bottom Middle
	*  @param zz Bottom Right */
	void setValue(const btScalar& xx, const btScalar& xy, const btScalar& xz,
		const btScalar& yx, const btScalar& yy, const btScalar& yz,
		const btScalar& zx, const btScalar& zy, const btScalar& zz)
	{
		m_el[0].setValue(xx,xy,xz);
		m_el[1].setValue(yx,yy,yz);
		m_el[2].setValue(zx,zy,zz);
	}
206
	/** @brief Set the matrix from a quaternion
	*  @param q The Quaternion to match; it need not be normalized,
	*  the squared length is divided out below. */
	void setRotation(const btQuaternion& q)
	{
		btScalar d = q.length2();
		btFullAssert(d != btScalar(0.0));
		btScalar s = btScalar(2.0) / d;	// common factor 2 / |q|^2

#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
		// Vectorized form of the scalar conversion in the #else branch:
		// each row is assembled from shuffled / sign-flipped copies of (x,y,z,w).
		__m128 vs, Q = q.get128();
		__m128i Qi = btCastfTo128i(Q);
		__m128 Y, Z;
		__m128 V1, V2, V3;
		__m128 V11, V21, V31;
		__m128 NQ = _mm_xor_ps(Q, btvMzeroMask);	// lane-wise negated quaternion
		__m128i NQi = btCastfTo128i(NQ);

		V1 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,2,3)));	// Y X Z W
		V2 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(0,0,1,3));	// -X -X  Y  W
		V3 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(2,1,0,3)));	// Z Y X W
		V1 = _mm_xor_ps(V1, vMPPP);	// change the sign of the first element

		V11 = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,1,0,3)));	// Y Y X W
		V21 = _mm_unpackhi_ps(Q, Q);	// Z Z W W
		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(0,2,0,3));	// X Z -X -W

		V2 = V2 * V1;	//
		V1 = V1 * V11;	//
		V3 = V3 * V31;	//

		V11 = _mm_shuffle_ps(NQ, Q, BT_SHUFFLE(2,3,1,3));	// -Z -W  Y  W
		V11 = V11 * V21;	//
		V21 = _mm_xor_ps(V21, vMPPP);	// change the sign of the first element
		V31 = _mm_shuffle_ps(Q, NQ, BT_SHUFFLE(3,3,1,3));	//  W  W -Y -W
		V31 = _mm_xor_ps(V31, vMPPP);	// change the sign of the first element
		Y = btCastiTo128f(_mm_shuffle_epi32 (NQi, BT_SHUFFLE(3,2,0,3)));	// -W -Z -X -W
		Z = btCastiTo128f(_mm_shuffle_epi32 (Qi, BT_SHUFFLE(1,0,1,3)));	// Y X Y W

		vs = _mm_load_ss(&s);
		V21 = V21 * Y;
		V31 = V31 * Z;

		V1 = V1 + V11;
		V2 = V2 + V21;
		V3 = V3 + V31;

		vs = bt_splat3_ps(vs, 0);	// broadcast s into the x,y,z lanes
		// s ready
		V1 = V1 * vs;
		V2 = V2 * vs;
		V3 = V3 * vs;

		// add the identity contribution to the diagonal terms
		V1 = V1 + v1000;
		V2 = V2 + v0100;
		V3 = V3 + v0010;

		m_el[0] = V1;
		m_el[1] = V2;
		m_el[2] = V3;
#else
		// Standard quaternion-to-matrix conversion, pre-scaled by s = 2/|q|^2.
		btScalar xs = q.x() * s, ys = q.y() * s, zs = q.z() * s;
		btScalar wx = q.w() * xs, wy = q.w() * ys, wz = q.w() * zs;
		btScalar xx = q.x() * xs, xy = q.x() * ys, xz = q.x() * zs;
		btScalar yy = q.y() * ys, yz = q.y() * zs, zz = q.z() * zs;
		setValue(
			btScalar(1.0) - (yy + zz), xy - wz, xz + wy,
			xy + wz, btScalar(1.0) - (xx + zz), yz - wx,
			xz - wy, yz + wx, btScalar(1.0) - (xx + yy));
#endif
	}
277
278
	/** @brief Set the matrix from euler angles using YPR around YXZ respectively
	*  @param yaw Yaw about Y axis
	*  @param pitch Pitch about X axis
	*  @param roll Roll about Z axis
	*/
	void setEulerYPR(const btScalar& yaw, const btScalar& pitch, const btScalar& roll)
	{
		// Delegates to setEulerZYX with the argument order reversed (x, y, z).
		setEulerZYX(roll, pitch, yaw);
	}

	/** @brief Set the matrix from euler angles YPR around ZYX axes
	*  @param eulerX Roll about X axis
	*  @param eulerY Pitch around Y axis
	*  @param eulerZ Yaw about Z axis
	*
	*  These angles are used to produce a rotation matrix. The euler
	*  angles are applied in ZYX order. I.e a vector is first rotated
	*  about X then Y and then Z
	**/
	void setEulerZYX(btScalar eulerX,btScalar eulerY,btScalar eulerZ) {
		///@todo proposed to reverse this since it's labeled zyx but takes arguments xyz and it will match all other parts of the code
		btScalar ci ( btCos(eulerX));	// c/s prefixes: cosine/sine of each angle
		btScalar cj ( btCos(eulerY));
		btScalar ch ( btCos(eulerZ));
		btScalar si ( btSin(eulerX));
		btScalar sj ( btSin(eulerY));
		btScalar sh ( btSin(eulerZ));
		btScalar cc = ci * ch;		// precomputed pairwise products
		btScalar cs = ci * sh;
		btScalar sc = si * ch;
		btScalar ss = si * sh;

		// Closed-form Rz(eulerZ) * Ry(eulerY) * Rx(eulerX)
		setValue(cj * ch, sj * sc - cs, sj * cc + ss,
			cj * sh, sj * ss + cc, sj * cs - sc,
			-sj, cj * si, cj * ci);
	}
315
	/**@brief Set the matrix to the identity */
	void setIdentity()
	{
#if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
		// Load the precomputed unit-row SIMD constants directly.
		m_el[0] = v1000;
		m_el[1] = v0100;
		m_el[2] = v0010;
#else
		setValue(btScalar(1.0), btScalar(0.0), btScalar(0.0),
			btScalar(0.0), btScalar(1.0), btScalar(0.0),
			btScalar(0.0), btScalar(0.0), btScalar(1.0));
#endif
	}

	/**@brief Return a reference to a function-local static identity matrix
	*  (constructed once on first use). */
	static const btMatrix3x3& getIdentity()
	{
#if (defined(BT_USE_SSE_IN_API)&& defined (BT_USE_SSE)) || defined(BT_USE_NEON)
		static const btMatrix3x3
			identityMatrix(v1000, v0100, v0010);
#else
		static const btMatrix3x3
			identityMatrix(
				btScalar(1.0), btScalar(0.0), btScalar(0.0),
				btScalar(0.0), btScalar(1.0), btScalar(0.0),
				btScalar(0.0), btScalar(0.0), btScalar(1.0));
#endif
		return identityMatrix;
	}
344
	/**@brief Fill the rotational part of an OpenGL (column-major) matrix and clear the shear/perspective
	*  @param m The array to be filled; m[0..2], m[4..6], m[8..10] receive the columns,
	*  m[3], m[7], m[11] are cleared to zero. */
	void getOpenGLSubMatrix(btScalar *m) const
	{
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
		// SIMD 3x3 transpose (rows out as columns), zeroing the 4th lane.
		__m128 v0 = m_el[0].mVec128;
		__m128 v1 = m_el[1].mVec128;
		__m128 v2 = m_el[2].mVec128;	//  x2 y2 z2 w2
		__m128 *vm = (__m128 *)m;
		__m128 vT;

		v2 = _mm_and_ps(v2, btvFFF0fMask);	//  x2 y2 z2 0

		vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
		v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1

		v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
		v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
		v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));	// z0 z1 z2 0

		vm[0] = v0;
		vm[1] = v1;
		vm[2] = v2;
#elif defined(BT_USE_NEON)
		// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
		static const uint32x2_t zMask = (const uint32x2_t) {static_cast<uint32_t>(-1), 0 };
		float32x4_t *vm = (float32x4_t *)m;
		float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );	// {x0 x1 z0 z1}, {y0 y1 w0 w1}
		float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );	// {x2 0 }, {y2 0}
		float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
		float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
		float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
		float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );	// z0 z1 z2 0

		vm[0] = v0;
		vm[1] = v1;
		vm[2] = v2;
#else
		// Scalar transpose: write each row into the corresponding OpenGL column.
		m[0]  = btScalar(m_el[0].x());
		m[1]  = btScalar(m_el[1].x());
		m[2]  = btScalar(m_el[2].x());
		m[3]  = btScalar(0.0);
		m[4]  = btScalar(m_el[0].y());
		m[5]  = btScalar(m_el[1].y());
		m[6]  = btScalar(m_el[2].y());
		m[7]  = btScalar(0.0);
		m[8]  = btScalar(m_el[0].z());
		m[9]  = btScalar(m_el[1].z());
		m[10] = btScalar(m_el[2].z());
		m[11] = btScalar(0.0);
#endif
	}
397
	/**@brief Get the matrix represented as a quaternion
	*  @param q The quaternion which will be set
	*  Uses the standard trace-based matrix-to-quaternion conversion; when the
	*  trace is non-positive the largest diagonal element selects the pivot axis
	*  for numerical stability. Assumes the matrix is a pure rotation. */
	void getRotation(btQuaternion& q) const
	{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
		btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();
		btScalar s, x;

		// union lets the four scalar components be written individually,
		// then handed to the quaternion as a single SIMD register.
		union {
			btSimdFloat4 vec;
			btScalar f[4];
		} temp;

		if (trace > btScalar(0.0))
		{
			x = trace + btScalar(1.0);

			temp.f[0]=m_el[2].y() - m_el[1].z();
			temp.f[1]=m_el[0].z() - m_el[2].x();
			temp.f[2]=m_el[1].x() - m_el[0].y();
			temp.f[3]=x;
			//temp.f[3]= s * btScalar(0.5);
		}
		else
		{
			// Pick i as the index of the largest diagonal element;
			// j, k are the following axes in cyclic order.
			int i, j, k;
			if(m_el[0].x() < m_el[1].y())
			{
				if( m_el[1].y() < m_el[2].z() )
					{ i = 2; j = 0; k = 1; }
				else
					{ i = 1; j = 2; k = 0; }
			}
			else
			{
				if( m_el[0].x() < m_el[2].z())
					{ i = 2; j = 0; k = 1; }
				else
					{ i = 0; j = 1; k = 2; }
			}

			x = m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0);

			temp.f[3] = (m_el[k][j] - m_el[j][k]);
			temp.f[j] = (m_el[j][i] + m_el[i][j]);
			temp.f[k] = (m_el[k][i] + m_el[i][k]);
			temp.f[i] = x;
			//temp.f[i] = s * btScalar(0.5);
		}

		// Normalize: all components were accumulated unscaled, so one
		// multiply by 0.5/sqrt(x) finishes the conversion.
		s = btSqrt(x);
		q.set128(temp.vec);
		s = btScalar(0.5) / s;

		q *= s;
#else
		btScalar trace = m_el[0].x() + m_el[1].y() + m_el[2].z();

		btScalar temp[4];

		if (trace > btScalar(0.0))
		{
			btScalar s = btSqrt(trace + btScalar(1.0));
			temp[3]=(s * btScalar(0.5));
			s = btScalar(0.5) / s;

			temp[0]=((m_el[2].y() - m_el[1].z()) * s);
			temp[1]=((m_el[0].z() - m_el[2].x()) * s);
			temp[2]=((m_el[1].x() - m_el[0].y()) * s);
		}
		else
		{
			// i = index of largest diagonal element; j, k follow cyclically.
			int i = m_el[0].x() < m_el[1].y() ?
				(m_el[1].y() < m_el[2].z() ? 2 : 1) :
				(m_el[0].x() < m_el[2].z() ? 2 : 0);
			int j = (i + 1) % 3;
			int k = (i + 2) % 3;

			btScalar s = btSqrt(m_el[i][i] - m_el[j][j] - m_el[k][k] + btScalar(1.0));
			temp[i] = s * btScalar(0.5);
			s = btScalar(0.5) / s;

			temp[3] = (m_el[k][j] - m_el[j][k]) * s;
			temp[j] = (m_el[j][i] + m_el[i][j]) * s;
			temp[k] = (m_el[k][i] + m_el[i][k]) * s;
		}
		q.setValue(temp[0],temp[1],temp[2],temp[3]);
#endif
	}
487
488 /**@brief Get the matrix represented as euler angles around YXZ, roundtrip with setEulerYPR
489 * @param yaw Yaw around Y axis
490 * @param pitch Pitch around X axis
491 * @param roll around Z axis */
492 void getEulerYPR(btScalar& yaw, btScalar& pitch, btScalar& roll) const
493 {
494
495 // first use the normal calculus
496 yaw = btScalar(btAtan2(m_el[1].x(), m_el[0].x()));
497 pitch = btScalar(btAsin(-m_el[2].x()));
498 roll = btScalar(btAtan2(m_el[2].y(), m_el[2].z()));
499
500 // on pitch = +/-HalfPI
501 if (btFabs(pitch)==SIMD_HALF_PI)
502 {
503 if (yaw>0)
504 yaw-=SIMD_PI;
505 else
506 yaw+=SIMD_PI;
507
508 if (roll>0)
509 roll-=SIMD_PI;
510 else
511 roll+=SIMD_PI;
512 }
513 };
514
515
	/**@brief Get the matrix represented as euler angles around ZYX
	*  @param yaw Yaw around Z axis
	*  @param pitch Pitch around Y axis
	*  @param roll Roll around X axis
	*  @param solution_number Which solution of two possible solutions ( 1 or 2) are possible values*/
	void getEulerZYX(btScalar& yaw, btScalar& pitch, btScalar& roll, unsigned int solution_number = 1) const
	{
		// Local aggregate for one (yaw, pitch, roll) candidate solution.
		struct Euler
		{
			btScalar yaw;
			btScalar pitch;
			btScalar roll;
		};

		Euler euler_out;
		Euler euler_out2; //second solution
		//get the pointer to the raw data

		// Check that pitch is not at a singularity (|m[2][0]| == 1 means
		// pitch is exactly +/- pi/2 and yaw/roll are coupled).
		if (btFabs(m_el[2].x()) >= 1)
		{
			euler_out.yaw = 0;
			euler_out2.yaw = 0;

			// From difference of angles formula
			btScalar delta = btAtan2(m_el[0].x(),m_el[0].z());
			if (m_el[2].x() > 0)  //gimbal locked up
			{
				euler_out.pitch = SIMD_PI / btScalar(2.0);
				euler_out2.pitch = SIMD_PI / btScalar(2.0);
				euler_out.roll = euler_out.pitch + delta;
				euler_out2.roll = euler_out.pitch + delta;
			}
			else // gimbal locked down
			{
				euler_out.pitch = -SIMD_PI / btScalar(2.0);
				euler_out2.pitch = -SIMD_PI / btScalar(2.0);
				euler_out.roll = -euler_out.pitch + delta;
				euler_out2.roll = -euler_out.pitch + delta;
			}
		}
		else
		{
			// Two valid decompositions exist: pitch and pi - pitch.
			euler_out.pitch = - btAsin(m_el[2].x());
			euler_out2.pitch = SIMD_PI - euler_out.pitch;

			euler_out.roll = btAtan2(m_el[2].y()/btCos(euler_out.pitch),
				m_el[2].z()/btCos(euler_out.pitch));
			euler_out2.roll = btAtan2(m_el[2].y()/btCos(euler_out2.pitch),
				m_el[2].z()/btCos(euler_out2.pitch));

			euler_out.yaw = btAtan2(m_el[1].x()/btCos(euler_out.pitch),
				m_el[0].x()/btCos(euler_out.pitch));
			euler_out2.yaw = btAtan2(m_el[1].x()/btCos(euler_out2.pitch),
				m_el[0].x()/btCos(euler_out2.pitch));
		}

		if (solution_number == 1)
		{
			yaw = euler_out.yaw;
			pitch = euler_out.pitch;
			roll = euler_out.roll;
		}
		else
		{
			yaw = euler_out2.yaw;
			pitch = euler_out2.pitch;
			roll = euler_out2.roll;
		}
	}
586
	/**@brief Create a scaled copy of the matrix
	*  @param s Scaling vector: the elements of the vector will scale each column */

	btMatrix3x3 scaled(const btVector3& s) const
	{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
		// Component-wise row scaling scales column j of the result by s[j].
		return btMatrix3x3(m_el[0] * s, m_el[1] * s, m_el[2] * s);
#else
		return btMatrix3x3(
			m_el[0].x() * s.x(), m_el[0].y() * s.y(), m_el[0].z() * s.z(),
			m_el[1].x() * s.x(), m_el[1].y() * s.y(), m_el[1].z() * s.z(),
			m_el[2].x() * s.x(), m_el[2].y() * s.y(), m_el[2].z() * s.z());
#endif
	}
601
	/**@brief Return the determinant of the matrix (defined out-of-line below) */
	btScalar determinant() const;
	/**@brief Return the adjoint of the matrix */
	btMatrix3x3 adjoint() const;
	/**@brief Return the matrix with all values non negative */
	btMatrix3x3 absolute() const;
	/**@brief Return the transpose of the matrix */
	btMatrix3x3 transpose() const;
	/**@brief Return the inverse of the matrix */
	btMatrix3x3 inverse() const;
612
613 /// Solve A * x = b, where b is a column vector. This is more efficient
614 /// than computing the inverse in one-shot cases.
615 ///Solve33 is from Box2d, thanks to Erin Catto,
616 btVector3 solve33(const btVector3& b) const
617 {
618 btVector3 col1 = getColumn(0);
619 btVector3 col2 = getColumn(1);
620 btVector3 col3 = getColumn(2);
621
622 btScalar det = btDot(col1, btCross(col2, col3));
623 if (btFabs(det)>SIMD_EPSILON)
624 {
625 det = 1.0f / det;
626 }
627 btVector3 x;
628 x[0] = det * btDot(b, btCross(col2, col3));
629 x[1] = det * btDot(col1, btCross(b, col3));
630 x[2] = det * btDot(col1, btCross(col2, b));
631 return x;
632 }
633
	btMatrix3x3 transposeTimes(const btMatrix3x3& m) const;
	btMatrix3x3 timesTranspose(const btMatrix3x3& m) const;

	/** @brief Dot product of v with column 0 of this matrix (transpose-row dot) */
	SIMD_FORCE_INLINE btScalar tdotx(const btVector3& v) const
	{
		return m_el[0].x() * v.x() + m_el[1].x() * v.y() + m_el[2].x() * v.z();
	}
	/** @brief Dot product of v with column 1 of this matrix */
	SIMD_FORCE_INLINE btScalar tdoty(const btVector3& v) const
	{
		return m_el[0].y() * v.x() + m_el[1].y() * v.y() + m_el[2].y() * v.z();
	}
	/** @brief Dot product of v with column 2 of this matrix */
	SIMD_FORCE_INLINE btScalar tdotz(const btVector3& v) const
	{
		return m_el[0].z() * v.x() + m_el[1].z() * v.y() + m_el[2].z() * v.z();
	}
649
650
	/**@brief diagonalizes this matrix by the Jacobi method.
	*  @param rot stores the rotation from the coordinate system in which the matrix is diagonal to the original
	*  coordinate system, i.e., old_this = rot * new_this * rot^T.
	*  @param threshold See iteration
	*  @param maxSteps The iteration stops when all off-diagonal elements are less than the threshold multiplied
	*  by the sum of the absolute values of the diagonal, or when maxSteps have been executed.
	*
	*  Note that this matrix is assumed to be symmetric.
	*/
	void diagonalize(btMatrix3x3& rot, btScalar threshold, int maxSteps)
	{
		rot.setIdentity();
		for (int step = maxSteps; step > 0; step--)
		{
			// find off-diagonal element [p][q] with largest magnitude
			int p = 0;
			int q = 1;
			int r = 2;
			btScalar max = btFabs(m_el[0][1]);
			btScalar v = btFabs(m_el[0][2]);
			if (v > max)
			{
				q = 2;
				r = 1;
				max = v;
			}
			v = btFabs(m_el[1][2]);
			if (v > max)
			{
				p = 1;
				q = 2;
				r = 0;
				max = v;
			}

			// convergence test against the scaled diagonal magnitude
			btScalar t = threshold * (btFabs(m_el[0][0]) + btFabs(m_el[1][1]) + btFabs(m_el[2][2]));
			if (max <= t)
			{
				if (max <= SIMD_EPSILON * t)
				{
					return;
				}
				step = 1;	// one final cleanup sweep, then stop
			}

			// compute Jacobi rotation J which leads to a zero for element [p][q]
			btScalar mpq = m_el[p][q];
			btScalar theta = (m_el[q][q] - m_el[p][p]) / (2 * mpq);
			btScalar theta2 = theta * theta;
			btScalar cos;
			btScalar sin;
			if (theta2 * theta2 < btScalar(10 / SIMD_EPSILON))
			{
				t = (theta >= 0) ? 1 / (theta + btSqrt(1 + theta2))
					: 1 / (theta - btSqrt(1 + theta2));
				cos = 1 / btSqrt(1 + t * t);
				sin = cos * t;
			}
			else
			{
				// approximation for large theta-value, i.e., a nearly diagonal matrix
				t = 1 / (theta * (2 + btScalar(0.5) / theta2));
				cos = 1 - btScalar(0.5) * t * t;
				sin = cos * t;
			}

			// apply rotation to matrix (this = J^T * this * J)
			m_el[p][q] = m_el[q][p] = 0;
			m_el[p][p] -= t * mpq;
			m_el[q][q] += t * mpq;
			btScalar mrp = m_el[r][p];
			btScalar mrq = m_el[r][q];
			m_el[r][p] = m_el[p][r] = cos * mrp - sin * mrq;
			m_el[r][q] = m_el[q][r] = cos * mrq + sin * mrp;

			// apply rotation to rot (rot = rot * J)
			for (int i = 0; i < 3; i++)
			{
				btVector3& row = rot[i];
				mrp = row[p];
				mrq = row[q];
				row[p] = cos * mrp - sin * mrq;
				row[q] = cos * mrq + sin * mrp;
			}
		}
	}
737
738
739
740
	/**@brief Calculate the matrix cofactor
	*  @param r1 The first row to use for calculating the cofactor
	*  @param c1 The first column to use for calculating the cofactor
	*  @param r2 The second row to use for calculating the cofactor
	*  @param c2 The second column to use for calculating the cofactor
	*  See http://en.wikipedia.org/wiki/Cofactor_(linear_algebra) for more details
	*/
	btScalar cofac(int r1, int c1, int r2, int c2) const
	{
		// 2x2 sub-determinant formed from rows r1,r2 and columns c1,c2.
		return m_el[r1][c1] * m_el[r2][c2] - m_el[r1][c2] * m_el[r2][c1];
	}
752
	/**@brief Serialize to precision-matching data layout (float or double build) */
	void serialize(struct btMatrix3x3Data& dataOut) const;

	/**@brief Serialize to single-precision layout regardless of build precision */
	void serializeFloat(struct btMatrix3x3FloatData& dataOut) const;

	void deSerialize(const struct btMatrix3x3Data& dataIn);

	void deSerializeFloat(const struct btMatrix3x3FloatData& dataIn);

	void deSerializeDouble(const struct btMatrix3x3DoubleData& dataIn);
762
763 };
764
765
/** Multiply in place by m on the right: this = this * m. */
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator*=(const btMatrix3x3& m)
{
#if defined BT_USE_SIMD_VECTOR3 && defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE)
	// Each result row is a linear combination of m's rows, weighted by the
	// corresponding element of this matrix's row (broadcast via splat).
	__m128 rv00, rv01, rv02;
	__m128 rv10, rv11, rv12;
	__m128 rv20, rv21, rv22;
	__m128 mv0, mv1, mv2;

	rv02 = m_el[0].mVec128;
	rv12 = m_el[1].mVec128;
	rv22 = m_el[2].mVec128;

	// mask off the unused w lane of m's rows
	mv0 = _mm_and_ps(m[0].mVec128, btvFFF0fMask);
	mv1 = _mm_and_ps(m[1].mVec128, btvFFF0fMask);
	mv2 = _mm_and_ps(m[2].mVec128, btvFFF0fMask);

	// rv0
	rv00 = bt_splat_ps(rv02, 0);
	rv01 = bt_splat_ps(rv02, 1);
	rv02 = bt_splat_ps(rv02, 2);

	rv00 = _mm_mul_ps(rv00, mv0);
	rv01 = _mm_mul_ps(rv01, mv1);
	rv02 = _mm_mul_ps(rv02, mv2);

	// rv1
	rv10 = bt_splat_ps(rv12, 0);
	rv11 = bt_splat_ps(rv12, 1);
	rv12 = bt_splat_ps(rv12, 2);

	rv10 = _mm_mul_ps(rv10, mv0);
	rv11 = _mm_mul_ps(rv11, mv1);
	rv12 = _mm_mul_ps(rv12, mv2);

	// rv2
	rv20 = bt_splat_ps(rv22, 0);
	rv21 = bt_splat_ps(rv22, 1);
	rv22 = bt_splat_ps(rv22, 2);

	rv20 = _mm_mul_ps(rv20, mv0);
	rv21 = _mm_mul_ps(rv21, mv1);
	rv22 = _mm_mul_ps(rv22, mv2);

	rv00 = _mm_add_ps(rv00, rv01);
	rv10 = _mm_add_ps(rv10, rv11);
	rv20 = _mm_add_ps(rv20, rv21);

	m_el[0].mVec128 = _mm_add_ps(rv00, rv02);
	m_el[1].mVec128 = _mm_add_ps(rv10, rv12);
	m_el[2].mVec128 = _mm_add_ps(rv20, rv22);

#elif defined(BT_USE_NEON)

	// Same row-combination scheme using NEON multiply-accumulate by lane.
	float32x4_t rv0, rv1, rv2;
	float32x4_t v0, v1, v2;
	float32x4_t mv0, mv1, mv2;

	v0 = m_el[0].mVec128;
	v1 = m_el[1].mVec128;
	v2 = m_el[2].mVec128;

	mv0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
	mv1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
	mv2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

	rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
	rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
	rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

	rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
	rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
	rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

	rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
	rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
	rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

	m_el[0].mVec128 = rv0;
	m_el[1].mVec128 = rv1;
	m_el[2].mVec128 = rv2;
#else
	// Scalar fallback: tdotx/y/z dot this row with m's columns.
	setValue(
		m.tdotx(m_el[0]), m.tdoty(m_el[0]), m.tdotz(m_el[0]),
		m.tdotx(m_el[1]), m.tdoty(m_el[1]), m.tdotz(m_el[1]),
		m.tdotx(m_el[2]), m.tdoty(m_el[2]), m.tdotz(m_el[2]));
#endif
	return *this;
}
855
/** Element-wise matrix addition in place: this = this + m. */
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator+=(const btMatrix3x3& m)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
	m_el[0].mVec128 = m_el[0].mVec128 + m.m_el[0].mVec128;
	m_el[1].mVec128 = m_el[1].mVec128 + m.m_el[1].mVec128;
	m_el[2].mVec128 = m_el[2].mVec128 + m.m_el[2].mVec128;
#else
	setValue(
		m_el[0][0]+m.m_el[0][0],
		m_el[0][1]+m.m_el[0][1],
		m_el[0][2]+m.m_el[0][2],
		m_el[1][0]+m.m_el[1][0],
		m_el[1][1]+m.m_el[1][1],
		m_el[1][2]+m.m_el[1][2],
		m_el[2][0]+m.m_el[2][0],
		m_el[2][1]+m.m_el[2][1],
		m_el[2][2]+m.m_el[2][2]);
#endif
	return *this;
}
877
/** Return a copy of m with every element multiplied by the scalar k. */
SIMD_FORCE_INLINE btMatrix3x3
operator*(const btMatrix3x3& m, const btScalar & k)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
	// broadcast k into all four lanes, then scale each row
	__m128 vk = bt_splat_ps(_mm_load_ss((float *)&k), 0x80);
	return btMatrix3x3(
		_mm_mul_ps(m[0].mVec128, vk),
		_mm_mul_ps(m[1].mVec128, vk),
		_mm_mul_ps(m[2].mVec128, vk));
#elif defined(BT_USE_NEON)
	return btMatrix3x3(
		vmulq_n_f32(m[0].mVec128, k),
		vmulq_n_f32(m[1].mVec128, k),
		vmulq_n_f32(m[2].mVec128, k));
#else
	return btMatrix3x3(
		m[0].x()*k,m[0].y()*k,m[0].z()*k,
		m[1].x()*k,m[1].y()*k,m[1].z()*k,
		m[2].x()*k,m[2].y()*k,m[2].z()*k);
#endif
}
899
/** Element-wise matrix addition: returns m1 + m2. */
SIMD_FORCE_INLINE btMatrix3x3
operator+(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
	return btMatrix3x3(
		m1[0].mVec128 + m2[0].mVec128,
		m1[1].mVec128 + m2[1].mVec128,
		m1[2].mVec128 + m2[2].mVec128);
#else
	return btMatrix3x3(
		m1[0][0]+m2[0][0],
		m1[0][1]+m2[0][1],
		m1[0][2]+m2[0][2],

		m1[1][0]+m2[1][0],
		m1[1][1]+m2[1][1],
		m1[1][2]+m2[1][2],

		m1[2][0]+m2[2][0],
		m1[2][1]+m2[2][1],
		m1[2][2]+m2[2][2]);
#endif
}
923
/** Element-wise matrix subtraction: returns m1 - m2. */
SIMD_FORCE_INLINE btMatrix3x3
operator-(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
	return btMatrix3x3(
		m1[0].mVec128 - m2[0].mVec128,
		m1[1].mVec128 - m2[1].mVec128,
		m1[2].mVec128 - m2[2].mVec128);
#else
	return btMatrix3x3(
		m1[0][0]-m2[0][0],
		m1[0][1]-m2[0][1],
		m1[0][2]-m2[0][2],

		m1[1][0]-m2[1][0],
		m1[1][1]-m2[1][1],
		m1[1][2]-m2[1][2],

		m1[2][0]-m2[2][0],
		m1[2][1]-m2[2][1],
		m1[2][2]-m2[2][2]);
#endif
}
947
948
/** Element-wise matrix subtraction in place: this = this - m. */
SIMD_FORCE_INLINE btMatrix3x3&
btMatrix3x3::operator-=(const btMatrix3x3& m)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
	m_el[0].mVec128 = m_el[0].mVec128 - m.m_el[0].mVec128;
	m_el[1].mVec128 = m_el[1].mVec128 - m.m_el[1].mVec128;
	m_el[2].mVec128 = m_el[2].mVec128 - m.m_el[2].mVec128;
#else
	setValue(
		m_el[0][0]-m.m_el[0][0],
		m_el[0][1]-m.m_el[0][1],
		m_el[0][2]-m.m_el[0][2],
		m_el[1][0]-m.m_el[1][0],
		m_el[1][1]-m.m_el[1][1],
		m_el[1][2]-m.m_el[1][2],
		m_el[2][0]-m.m_el[2][0],
		m_el[2][1]-m.m_el[2][1],
		m_el[2][2]-m.m_el[2][2]);
#endif
	return *this;
}
970
971
972 SIMD_FORCE_INLINE btScalar
determinant()973 btMatrix3x3::determinant() const
974 {
975 return btTriple((*this)[0], (*this)[1], (*this)[2]);
976 }
977
978
/** Return a copy of the matrix with every element replaced by its absolute value. */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::absolute() const
{
#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
	// clear the sign bit of each float lane
	return btMatrix3x3(
		_mm_and_ps(m_el[0].mVec128, btvAbsfMask),
		_mm_and_ps(m_el[1].mVec128, btvAbsfMask),
		_mm_and_ps(m_el[2].mVec128, btvAbsfMask));
#elif defined(BT_USE_NEON)
	return btMatrix3x3(
		(float32x4_t)vandq_s32((int32x4_t)m_el[0].mVec128, btv3AbsMask),
		(float32x4_t)vandq_s32((int32x4_t)m_el[1].mVec128, btv3AbsMask),
		(float32x4_t)vandq_s32((int32x4_t)m_el[2].mVec128, btv3AbsMask));
#else
	return btMatrix3x3(
		btFabs(m_el[0].x()), btFabs(m_el[0].y()), btFabs(m_el[0].z()),
		btFabs(m_el[1].x()), btFabs(m_el[1].y()), btFabs(m_el[1].z()),
		btFabs(m_el[2].x()), btFabs(m_el[2].y()), btFabs(m_el[2].z()));
#endif
}
999
/** Return the transpose of the matrix (rows become columns). */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::transpose() const
{
#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
	// SSE 3x3 transpose via unpack/shuffle; the w lane of each row ends up 0.
	__m128 v0 = m_el[0].mVec128;
	__m128 v1 = m_el[1].mVec128;
	__m128 v2 = m_el[2].mVec128;	//  x2 y2 z2 w2
	__m128 vT;

	v2 = _mm_and_ps(v2, btvFFF0fMask);	//  x2 y2 z2 0

	vT = _mm_unpackhi_ps(v0, v1);	//	z0 z1 * *
	v0 = _mm_unpacklo_ps(v0, v1);	//	x0 x1 y0 y1

	v1 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(2, 3, 1, 3) );	// y0 y1 y2 0
	v0 = _mm_shuffle_ps(v0, v2, BT_SHUFFLE(0, 1, 0, 3) );	// x0 x1 x2 0
	v2 = btCastdTo128f(_mm_move_sd(btCastfTo128d(v2), btCastfTo128d(vT)));	// z0 z1 z2 0


	return btMatrix3x3( v0, v1, v2 );
#elif defined(BT_USE_NEON)
	// note: zeros the w channel. We can preserve it at the cost of two more vtrn instructions.
	static const uint32x2_t zMask = (const uint32x2_t) {static_cast<uint32_t>(-1), 0 };
	float32x4x2_t top = vtrnq_f32( m_el[0].mVec128, m_el[1].mVec128 );	// {x0 x1 z0 z1}, {y0 y1 w0 w1}
	float32x2x2_t bl = vtrn_f32( vget_low_f32(m_el[2].mVec128), vdup_n_f32(0.0f) );	// {x2 0 }, {y2 0}
	float32x4_t v0 = vcombine_f32( vget_low_f32(top.val[0]), bl.val[0] );
	float32x4_t v1 = vcombine_f32( vget_low_f32(top.val[1]), bl.val[1] );
	float32x2_t q = (float32x2_t) vand_u32( (uint32x2_t) vget_high_f32( m_el[2].mVec128), zMask );
	float32x4_t v2 = vcombine_f32( vget_high_f32(top.val[0]), q );	// z0 z1 z2 0
	return btMatrix3x3( v0, v1, v2 );
#else
	// scalar fallback: build rows from the original columns
	return btMatrix3x3( m_el[0].x(), m_el[1].x(), m_el[2].x(),
		m_el[0].y(), m_el[1].y(), m_el[2].y(),
		m_el[0].z(), m_el[1].z(), m_el[2].z());
#endif
}
1036
1037 SIMD_FORCE_INLINE btMatrix3x3
adjoint()1038 btMatrix3x3::adjoint() const
1039 {
1040 return btMatrix3x3(cofac(1, 1, 2, 2), cofac(0, 2, 2, 1), cofac(0, 1, 1, 2),
1041 cofac(1, 2, 2, 0), cofac(0, 0, 2, 2), cofac(0, 2, 1, 0),
1042 cofac(1, 0, 2, 1), cofac(0, 1, 2, 0), cofac(0, 0, 1, 1));
1043 }
1044
1045 SIMD_FORCE_INLINE btMatrix3x3
inverse()1046 btMatrix3x3::inverse() const
1047 {
1048 btVector3 co(cofac(1, 1, 2, 2), cofac(1, 2, 2, 0), cofac(1, 0, 2, 1));
1049 btScalar det = (*this)[0].dot(co);
1050 //btFullAssert(det != btScalar(0.0));
1051 btAssert(det != btScalar(0.0));
1052 btScalar s = btScalar(1.0) / det;
1053 return btMatrix3x3(co.x() * s, cofac(0, 2, 2, 1) * s, cofac(0, 1, 1, 2) * s,
1054 co.y() * s, cofac(0, 0, 2, 2) * s, cofac(0, 2, 1, 0) * s,
1055 co.z() * s, cofac(0, 1, 2, 0) * s, cofac(0, 0, 1, 1) * s);
1056 }
1057
/**@brief Return (this matrix transposed) * m, without materializing the transpose. */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::transposeTimes(const btMatrix3x3& m) const
{
#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
	// zeros w
	//    static const __m128i xyzMask = (const __m128i){ -1ULL, 0xffffffffULL };
	// Result row i is the linear combination sum_k (this[k][i] * m-row-k),
	// accumulated one row of m at a time by splatting this-matrix elements.
	__m128 row = m_el[0].mVec128;
	__m128 m0 = _mm_and_ps( m.getRow(0).mVec128, btvFFF0fMask );
	__m128 m1 = _mm_and_ps( m.getRow(1).mVec128, btvFFF0fMask);
	__m128 m2 = _mm_and_ps( m.getRow(2).mVec128, btvFFF0fMask );
	__m128 r0 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0));
	__m128 r1 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0x55));
	__m128 r2 = _mm_mul_ps(m0, _mm_shuffle_ps(row, row, 0xaa));
	row = m_el[1].mVec128;
	r0 = _mm_add_ps( r0, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0)));
	r1 = _mm_add_ps( r1, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0x55)));
	r2 = _mm_add_ps( r2, _mm_mul_ps(m1, _mm_shuffle_ps(row, row, 0xaa)));
	row = m_el[2].mVec128;
	r0 = _mm_add_ps( r0, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0)));
	r1 = _mm_add_ps( r1, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0x55)));
	r2 = _mm_add_ps( r2, _mm_mul_ps(m2, _mm_shuffle_ps(row, row, 0xaa)));
	return btMatrix3x3( r0, r1, r2 );

#elif defined BT_USE_NEON
	// zeros w
	static const uint32x4_t xyzMask = (const uint32x4_t){ static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), static_cast<uint32_t>(-1), 0 };
	// Same accumulation scheme as the SSE path, using lane-broadcast multiply-accumulate.
	float32x4_t m0 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(0).mVec128, xyzMask );
	float32x4_t m1 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(1).mVec128, xyzMask );
	float32x4_t m2 = (float32x4_t) vandq_u32( (uint32x4_t) m.getRow(2).mVec128, xyzMask );
	float32x4_t row = m_el[0].mVec128;
	float32x4_t r0 = vmulq_lane_f32( m0, vget_low_f32(row), 0);
	float32x4_t r1 = vmulq_lane_f32( m0, vget_low_f32(row), 1);
	float32x4_t r2 = vmulq_lane_f32( m0, vget_high_f32(row), 0);
	row = m_el[1].mVec128;
	r0 = vmlaq_lane_f32( r0, m1, vget_low_f32(row), 0);
	r1 = vmlaq_lane_f32( r1, m1, vget_low_f32(row), 1);
	r2 = vmlaq_lane_f32( r2, m1, vget_high_f32(row), 0);
	row = m_el[2].mVec128;
	r0 = vmlaq_lane_f32( r0, m2, vget_low_f32(row), 0);
	r1 = vmlaq_lane_f32( r1, m2, vget_low_f32(row), 1);
	r2 = vmlaq_lane_f32( r2, m2, vget_high_f32(row), 0);
	return btMatrix3x3( r0, r1, r2 );
#else
	// Scalar fallback: result(i,j) = sum_k this(k,i) * m(k,j) — columns of *this
	// dotted with columns of m.
	return btMatrix3x3(
		m_el[0].x() * m[0].x() + m_el[1].x() * m[1].x() + m_el[2].x() * m[2].x(),
		m_el[0].x() * m[0].y() + m_el[1].x() * m[1].y() + m_el[2].x() * m[2].y(),
		m_el[0].x() * m[0].z() + m_el[1].x() * m[1].z() + m_el[2].x() * m[2].z(),
		m_el[0].y() * m[0].x() + m_el[1].y() * m[1].x() + m_el[2].y() * m[2].x(),
		m_el[0].y() * m[0].y() + m_el[1].y() * m[1].y() + m_el[2].y() * m[2].y(),
		m_el[0].y() * m[0].z() + m_el[1].y() * m[1].z() + m_el[2].y() * m[2].z(),
		m_el[0].z() * m[0].x() + m_el[1].z() * m[1].x() + m_el[2].z() * m[2].x(),
		m_el[0].z() * m[0].y() + m_el[1].z() * m[1].y() + m_el[2].z() * m[2].y(),
		m_el[0].z() * m[0].z() + m_el[1].z() * m[1].z() + m_el[2].z() * m[2].z());
#endif
}
1113
/**@brief Return this matrix * (m transposed).
*  In the scalar path, result(i,j) is the dot product of row i of *this with row j of m. */
SIMD_FORCE_INLINE btMatrix3x3
btMatrix3x3::timesTranspose(const btMatrix3x3& m) const
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))
	__m128 a0 = m_el[0].mVec128;
	__m128 a1 = m_el[1].mVec128;
	__m128 a2 = m_el[2].mVec128;

	btMatrix3x3 mT = m.transpose();	// we rely on transpose() zeroing w channel so that we don't have to do it here
	__m128 mx = mT[0].mVec128;
	__m128 my = mT[1].mVec128;
	__m128 mz = mT[2].mVec128;

	// Result row i = a_i.x * mx + a_i.y * my + a_i.z * mz (splat each component, multiply, accumulate).
	__m128 r0 = _mm_mul_ps(mx, _mm_shuffle_ps(a0, a0, 0x00));
	__m128 r1 = _mm_mul_ps(mx, _mm_shuffle_ps(a1, a1, 0x00));
	__m128 r2 = _mm_mul_ps(mx, _mm_shuffle_ps(a2, a2, 0x00));
	r0 = _mm_add_ps(r0, _mm_mul_ps(my, _mm_shuffle_ps(a0, a0, 0x55)));
	r1 = _mm_add_ps(r1, _mm_mul_ps(my, _mm_shuffle_ps(a1, a1, 0x55)));
	r2 = _mm_add_ps(r2, _mm_mul_ps(my, _mm_shuffle_ps(a2, a2, 0x55)));
	r0 = _mm_add_ps(r0, _mm_mul_ps(mz, _mm_shuffle_ps(a0, a0, 0xaa)));
	r1 = _mm_add_ps(r1, _mm_mul_ps(mz, _mm_shuffle_ps(a1, a1, 0xaa)));
	r2 = _mm_add_ps(r2, _mm_mul_ps(mz, _mm_shuffle_ps(a2, a2, 0xaa)));
	return btMatrix3x3( r0, r1, r2);

#elif defined BT_USE_NEON
	float32x4_t a0 = m_el[0].mVec128;
	float32x4_t a1 = m_el[1].mVec128;
	float32x4_t a2 = m_el[2].mVec128;

	btMatrix3x3 mT = m.transpose();	// we rely on transpose() zeroing w channel so that we don't have to do it here
	float32x4_t mx = mT[0].mVec128;
	float32x4_t my = mT[1].mVec128;
	float32x4_t mz = mT[2].mVec128;

	// Same per-row accumulation as the SSE path, via lane-broadcast multiply-accumulate.
	float32x4_t r0 = vmulq_lane_f32( mx, vget_low_f32(a0), 0);
	float32x4_t r1 = vmulq_lane_f32( mx, vget_low_f32(a1), 0);
	float32x4_t r2 = vmulq_lane_f32( mx, vget_low_f32(a2), 0);
	r0 = vmlaq_lane_f32( r0, my, vget_low_f32(a0), 1);
	r1 = vmlaq_lane_f32( r1, my, vget_low_f32(a1), 1);
	r2 = vmlaq_lane_f32( r2, my, vget_low_f32(a2), 1);
	r0 = vmlaq_lane_f32( r0, mz, vget_high_f32(a0), 0);
	r1 = vmlaq_lane_f32( r1, mz, vget_high_f32(a1), 0);
	r2 = vmlaq_lane_f32( r2, mz, vget_high_f32(a2), 0);
	return btMatrix3x3( r0, r1, r2 );

#else
	// Scalar fallback: rows of *this dotted with rows of m.
	return btMatrix3x3(
		m_el[0].dot(m[0]), m_el[0].dot(m[1]), m_el[0].dot(m[2]),
		m_el[1].dot(m[0]), m_el[1].dot(m[1]), m_el[1].dot(m[2]),
		m_el[2].dot(m[0]), m_el[2].dot(m[1]), m_el[2].dot(m[2]));
#endif
}
1166
/**@brief Matrix * column-vector product: component i of the result is row i of m dotted with v. */
SIMD_FORCE_INLINE btVector3
operator*(const btMatrix3x3& m, const btVector3& v)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))|| defined (BT_USE_NEON)
	// dot3 computes the three row dot products in one SIMD-friendly call.
	return v.dot3(m[0], m[1], m[2]);
#else
	return btVector3(m[0].dot(v), m[1].dot(v), m[2].dot(v));
#endif
}
1176
1177
/**@brief Row-vector * matrix product (v * m): the result is the linear combination
*  v.x * row0 + v.y * row1 + v.z * row2, i.e. v dotted with each column of m. */
SIMD_FORCE_INLINE btVector3
operator*(const btVector3& v, const btMatrix3x3& m)
{
#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))

	const __m128 vv = v.mVec128;

	// Broadcast each component of v across a register...
	__m128 c0 = bt_splat_ps( vv, 0);
	__m128 c1 = bt_splat_ps( vv, 1);
	__m128 c2 = bt_splat_ps( vv, 2);

	// ...scale the matching row (with w masked to 0) and accumulate.
	c0 = _mm_mul_ps(c0, _mm_and_ps(m[0].mVec128, btvFFF0fMask) );
	c1 = _mm_mul_ps(c1, _mm_and_ps(m[1].mVec128, btvFFF0fMask) );
	c0 = _mm_add_ps(c0, c1);
	c2 = _mm_mul_ps(c2, _mm_and_ps(m[2].mVec128, btvFFF0fMask) );

	return btVector3(_mm_add_ps(c0, c2));
#elif defined(BT_USE_NEON)
	const float32x4_t vv = v.mVec128;
	const float32x2_t vlo = vget_low_f32(vv);
	const float32x2_t vhi = vget_high_f32(vv);

	float32x4_t c0, c1, c2;

	// Mask the w lane of each row, then scale by the matching lane of v and sum.
	c0 = (float32x4_t) vandq_s32((int32x4_t)m[0].mVec128, btvFFF0Mask);
	c1 = (float32x4_t) vandq_s32((int32x4_t)m[1].mVec128, btvFFF0Mask);
	c2 = (float32x4_t) vandq_s32((int32x4_t)m[2].mVec128, btvFFF0Mask);

	c0 = vmulq_lane_f32(c0, vlo, 0);
	c1 = vmulq_lane_f32(c1, vlo, 1);
	c2 = vmulq_lane_f32(c2, vhi, 0);
	c0 = vaddq_f32(c0, c1);
	c0 = vaddq_f32(c0, c2);

	return btVector3(c0);
#else
	// Scalar fallback: tdotx/tdoty/tdotz dot v against the columns of m.
	return btVector3(m.tdotx(v), m.tdoty(v), m.tdotz(v));
#endif
}
1217
/**@brief Matrix product m1 * m2.
*  SIMD paths compute result row i as m1[i].x * m2-row0 + m1[i].y * m2-row1 + m1[i].z * m2-row2. */
SIMD_FORCE_INLINE btMatrix3x3
operator*(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if defined BT_USE_SIMD_VECTOR3 && (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))

	__m128 m10 = m1[0].mVec128;
	__m128 m11 = m1[1].mVec128;
	__m128 m12 = m1[2].mVec128;

	// m2v holds one row of m2 at a time, with the w lane masked to 0.
	__m128 m2v = _mm_and_ps(m2[0].mVec128, btvFFF0fMask);

	// Contribution of m2 row 0, weighted by the x component of each m1 row.
	__m128 c0 = bt_splat_ps( m10, 0);
	__m128 c1 = bt_splat_ps( m11, 0);
	__m128 c2 = bt_splat_ps( m12, 0);

	c0 = _mm_mul_ps(c0, m2v);
	c1 = _mm_mul_ps(c1, m2v);
	c2 = _mm_mul_ps(c2, m2v);

	m2v = _mm_and_ps(m2[1].mVec128, btvFFF0fMask);

	// Contribution of m2 row 1, weighted by the y components.
	__m128 c0_1 = bt_splat_ps( m10, 1);
	__m128 c1_1 = bt_splat_ps( m11, 1);
	__m128 c2_1 = bt_splat_ps( m12, 1);

	c0_1 = _mm_mul_ps(c0_1, m2v);
	c1_1 = _mm_mul_ps(c1_1, m2v);
	c2_1 = _mm_mul_ps(c2_1, m2v);

	m2v = _mm_and_ps(m2[2].mVec128, btvFFF0fMask);

	c0 = _mm_add_ps(c0, c0_1);
	c1 = _mm_add_ps(c1, c1_1);
	c2 = _mm_add_ps(c2, c2_1);

	// Contribution of m2 row 2, weighted by the z components (m1 registers are
	// reused as scratch here since their original values are no longer needed).
	m10 = bt_splat_ps( m10, 2);
	m11 = bt_splat_ps( m11, 2);
	m12 = bt_splat_ps( m12, 2);

	m10 = _mm_mul_ps(m10, m2v);
	m11 = _mm_mul_ps(m11, m2v);
	m12 = _mm_mul_ps(m12, m2v);

	c0 = _mm_add_ps(c0, m10);
	c1 = _mm_add_ps(c1, m11);
	c2 = _mm_add_ps(c2, m12);

	return btMatrix3x3(c0, c1, c2);

#elif defined(BT_USE_NEON)

	float32x4_t rv0, rv1, rv2;
	float32x4_t v0, v1, v2;
	float32x4_t mv0, mv1, mv2;

	v0 = m1[0].mVec128;
	v1 = m1[1].mVec128;
	v2 = m1[2].mVec128;

	// Rows of m2 with the w lane masked to 0.
	mv0 = (float32x4_t) vandq_s32((int32x4_t)m2[0].mVec128, btvFFF0Mask);
	mv1 = (float32x4_t) vandq_s32((int32x4_t)m2[1].mVec128, btvFFF0Mask);
	mv2 = (float32x4_t) vandq_s32((int32x4_t)m2[2].mVec128, btvFFF0Mask);

	// Accumulate the three weighted rows via lane-broadcast multiply-accumulate.
	rv0 = vmulq_lane_f32(mv0, vget_low_f32(v0), 0);
	rv1 = vmulq_lane_f32(mv0, vget_low_f32(v1), 0);
	rv2 = vmulq_lane_f32(mv0, vget_low_f32(v2), 0);

	rv0 = vmlaq_lane_f32(rv0, mv1, vget_low_f32(v0), 1);
	rv1 = vmlaq_lane_f32(rv1, mv1, vget_low_f32(v1), 1);
	rv2 = vmlaq_lane_f32(rv2, mv1, vget_low_f32(v2), 1);

	rv0 = vmlaq_lane_f32(rv0, mv2, vget_high_f32(v0), 0);
	rv1 = vmlaq_lane_f32(rv1, mv2, vget_high_f32(v1), 0);
	rv2 = vmlaq_lane_f32(rv2, mv2, vget_high_f32(v2), 0);

	return btMatrix3x3(rv0, rv1, rv2);

#else
	// Scalar fallback: result(i,j) = row i of m1 dotted with column j of m2 (tdot*).
	return btMatrix3x3(
		m2.tdotx( m1[0]), m2.tdoty( m1[0]), m2.tdotz( m1[0]),
		m2.tdotx( m1[1]), m2.tdoty( m1[1]), m2.tdotz( m1[1]),
		m2.tdotx( m1[2]), m2.tdoty( m1[2]), m2.tdotz( m1[2]));
#endif
}
1302
1303 /*
1304 SIMD_FORCE_INLINE btMatrix3x3 btMultTransposeLeft(const btMatrix3x3& m1, const btMatrix3x3& m2) {
1305 return btMatrix3x3(
1306 m1[0][0] * m2[0][0] + m1[1][0] * m2[1][0] + m1[2][0] * m2[2][0],
1307 m1[0][0] * m2[0][1] + m1[1][0] * m2[1][1] + m1[2][0] * m2[2][1],
1308 m1[0][0] * m2[0][2] + m1[1][0] * m2[1][2] + m1[2][0] * m2[2][2],
1309 m1[0][1] * m2[0][0] + m1[1][1] * m2[1][0] + m1[2][1] * m2[2][0],
1310 m1[0][1] * m2[0][1] + m1[1][1] * m2[1][1] + m1[2][1] * m2[2][1],
1311 m1[0][1] * m2[0][2] + m1[1][1] * m2[1][2] + m1[2][1] * m2[2][2],
1312 m1[0][2] * m2[0][0] + m1[1][2] * m2[1][0] + m1[2][2] * m2[2][0],
1313 m1[0][2] * m2[0][1] + m1[1][2] * m2[1][1] + m1[2][2] * m2[2][1],
1314 m1[0][2] * m2[0][2] + m1[1][2] * m2[1][2] + m1[2][2] * m2[2][2]);
1315 }
1316 */
1317
/**@brief Equality operator between two matrices.
*  It will test all elements are equal (exact floating-point comparison, no tolerance). */
SIMD_FORCE_INLINE bool operator==(const btMatrix3x3& m1, const btMatrix3x3& m2)
{
#if (defined (BT_USE_SSE_IN_API) && defined (BT_USE_SSE))

	__m128 c0, c1, c2;

	// Per-lane compare of each row pair yields all-ones in equal lanes.
	c0 = _mm_cmpeq_ps(m1[0].mVec128, m2[0].mVec128);
	c1 = _mm_cmpeq_ps(m1[1].mVec128, m2[1].mVec128);
	c2 = _mm_cmpeq_ps(m1[2].mVec128, m2[2].mVec128);

	// AND the rows so a lane survives only if it matched in all three rows.
	c0 = _mm_and_ps(c0, c1);
	c0 = _mm_and_ps(c0, c2);

	// movemask packs the lane sign bits; only x/y/z (bits 0-2) matter, the w lane is ignored.
	int m = _mm_movemask_ps((__m128)c0);
	return (0x7 == (m & 0x7));

#else
	return
	(	m1[0][0] == m2[0][0] && m1[1][0] == m2[1][0] && m1[2][0] == m2[2][0] &&
	m1[0][1] == m2[0][1] && m1[1][1] == m2[1][1] && m1[2][1] == m2[2][1] &&
	m1[0][2] == m2[0][2] && m1[1][2] == m2[1][2] && m1[2][2] == m2[2][2] );
#endif
}
1343
///for serialization: single-precision on-disk layout of btMatrix3x3 (three row vectors)
struct btMatrix3x3FloatData
{
	btVector3FloatData m_el[3];  // the three matrix rows
};
1349
///for serialization: double-precision on-disk layout of btMatrix3x3 (three row vectors)
struct btMatrix3x3DoubleData
{
	btVector3DoubleData m_el[3];  // the three matrix rows
};
1355
1356
1357
1358
serialize(struct btMatrix3x3Data & dataOut)1359 SIMD_FORCE_INLINE void btMatrix3x3::serialize(struct btMatrix3x3Data& dataOut) const
1360 {
1361 for (int i=0;i<3;i++)
1362 m_el[i].serialize(dataOut.m_el[i]);
1363 }
1364
serializeFloat(struct btMatrix3x3FloatData & dataOut)1365 SIMD_FORCE_INLINE void btMatrix3x3::serializeFloat(struct btMatrix3x3FloatData& dataOut) const
1366 {
1367 for (int i=0;i<3;i++)
1368 m_el[i].serializeFloat(dataOut.m_el[i]);
1369 }
1370
1371
deSerialize(const struct btMatrix3x3Data & dataIn)1372 SIMD_FORCE_INLINE void btMatrix3x3::deSerialize(const struct btMatrix3x3Data& dataIn)
1373 {
1374 for (int i=0;i<3;i++)
1375 m_el[i].deSerialize(dataIn.m_el[i]);
1376 }
1377
deSerializeFloat(const struct btMatrix3x3FloatData & dataIn)1378 SIMD_FORCE_INLINE void btMatrix3x3::deSerializeFloat(const struct btMatrix3x3FloatData& dataIn)
1379 {
1380 for (int i=0;i<3;i++)
1381 m_el[i].deSerializeFloat(dataIn.m_el[i]);
1382 }
1383
deSerializeDouble(const struct btMatrix3x3DoubleData & dataIn)1384 SIMD_FORCE_INLINE void btMatrix3x3::deSerializeDouble(const struct btMatrix3x3DoubleData& dataIn)
1385 {
1386 for (int i=0;i<3;i++)
1387 m_el[i].deSerializeDouble(dataIn.m_el[i]);
1388 }
1389
1390 #endif //BT_MATRIX3x3_H
1391
1392