1 /*
2 Copyright (c) 2003-2013 Gino van den Bergen / Erwin Coumans  http://bulletphysics.org
3 
4 This software is provided 'as-is', without any express or implied warranty.
5 In no event will the authors be held liable for any damages arising from the use of this software.
6 Permission is granted to anyone to use this software for any purpose,
7 including commercial applications, and to alter it and redistribute it freely,
8 subject to the following restrictions:
9 
10 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required.
11 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software.
12 3. This notice may not be removed or altered from any source distribution.
13 */
14 
15 #ifndef B3_VECTOR3_H
16 #define B3_VECTOR3_H
17 
18 //#include <stdint.h>
19 #include "b3Scalar.h"
20 #include "b3MinMax.h"
21 #include "b3AlignedAllocator.h"
22 
// Select the serialization record used for b3Vector3: b3Vector3Data (and its
// name string b3Vector3DataName) alias the double record when
// B3_USE_DOUBLE_PRECISION is defined and the float record otherwise.
#ifdef B3_USE_DOUBLE_PRECISION
#define b3Vector3Data b3Vector3DoubleData
#define b3Vector3DataName "b3Vector3DoubleData"
#else
#define b3Vector3Data b3Vector3FloatData
#define b3Vector3DataName "b3Vector3FloatData"
#endif  //B3_USE_DOUBLE_PRECISION
30 
// SSE helper macros and constants used by the SIMD code paths of b3Vector3.
#if defined B3_USE_SSE

//typedef  uint32_t __m128i __attribute__ ((vector_size(16)));

#ifdef _MSC_VER
#pragma warning(disable : 4556)  // value of intrinsic immediate argument '4294967239' is out of range '0 - 255'
#endif

// Pack four 2-bit lane selectors into one 8-bit shuffle immediate; x occupies the lowest two bits, w the highest two.
#define B3_SHUFFLE(x, y, z, w) (((w) << 6 | (z) << 4 | (y) << 2 | (x)) & 0xff)
//#define b3_pshufd_ps( _a, _mask ) (__m128) _mm_shuffle_epi32((__m128i)(_a), (_mask) )
// Shuffle a register with itself. Note that _a is expanded twice by this macro.
#define b3_pshufd_ps(_a, _mask) _mm_shuffle_ps((_a), (_a), (_mask))
// Copy lane _i into the first three result lanes; the fourth result lane selects source lane 3.
#define b3_splat3_ps(_a, _i) b3_pshufd_ps((_a), B3_SHUFFLE(_i, _i, _i, 3))
// Copy lane _i into all four result lanes.
#define b3_splat_ps(_a, _i) b3_pshufd_ps((_a), B3_SHUFFLE(_i, _i, _i, _i))

// Integer bit masks. _mm_set_epi32 takes its arguments from the highest lane
// down to the lowest, so the first argument is the w lane.
// b3v3AbsiMask: clears the sign bit of x, y, z and zeroes w.
#define b3v3AbsiMask (_mm_set_epi32(0x00000000, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// b3vAbsMask: clears the sign bit of all four lanes.
#define b3vAbsMask (_mm_set_epi32(0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF))
// b3vFFF0Mask: keeps x, y, z unchanged and zeroes w.
#define b3vFFF0Mask (_mm_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF))
// The same masks passed through b3CastiTo128f for use with the _ps bitwise intrinsics.
#define b3v3AbsfMask b3CastiTo128f(b3v3AbsiMask)
#define b3vFFF0fMask b3CastiTo128f(b3vFFF0Mask)
#define b3vxyzMaskf b3vFFF0fMask
#define b3vAbsfMask b3CastiTo128f(b3vAbsMask)

// Float constants. b3vMzeroMask holds -0.0f in every lane (only the sign bit set),
// so XOR-ing with it flips the sign of every lane.
const __m128 B3_ATTRIBUTE_ALIGNED16(b3vMzeroMask) = {-0.0f, -0.0f, -0.0f, -0.0f};
const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1110) = {1.0f, 1.0f, 1.0f, 0.0f};
// 0.5 and 1.5 are the constants of the Newton-Raphson reciprocal-square-root step in b3Vector3::normalize().
const __m128 B3_ATTRIBUTE_ALIGNED16(b3vHalf) = {0.5f, 0.5f, 0.5f, 0.5f};
const __m128 B3_ATTRIBUTE_ALIGNED16(b3v1_5) = {1.5f, 1.5f, 1.5f, 1.5f};

#endif
59 
// NEON counterparts of the SSE masks: lanes are listed in x, y, z, w order.
#ifdef B3_USE_NEON

// -0.0f in every lane (only the sign bit set); XOR-ing with it flips the sign of every lane.
const float32x4_t B3_ATTRIBUTE_ALIGNED16(b3vMzeroMask) = (float32x4_t){-0.0f, -0.0f, -0.0f, -0.0f};
// Keeps x, y, z unchanged and zeroes w.
const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3vFFF0Mask) = (int32x4_t){0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x0};
// Clears the sign bit of all four lanes.
const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3vAbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF};
// Clears the sign bit of x, y, z and zeroes w.
const int32x4_t B3_ATTRIBUTE_ALIGNED16(b3v3AbsMask) = (int32x4_t){0x7FFFFFFF, 0x7FFFFFFF, 0x7FFFFFFF, 0x0};

#endif
68 
// Forward declarations: the inline member functions of b3Vector3 below build
// their results through these factory functions.
class b3Vector3;
class b3Vector4;

// Factories taking a whole SIMD register; only declared for SSE builds that
// also enable SSE in the API.
#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
//#if defined (B3_USE_SSE) || defined (B3_USE_NEON)
inline b3Vector3 b3MakeVector3(b3SimdFloat4 v);
inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec);
#endif

// Factories taking individual components.
inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z);
inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w);
inline b3Vector4 b3MakeVector4(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w);
81 
82 /**@brief b3Vector3 can be used to represent 3D points and vectors.
83  * It has an un-used w component to suit 16-byte alignment when b3Vector3 is stored in containers. This extra component can be used by derived classes (Quaternion?) or by user
84  * Ideally, this class should be replaced by a platform optimized SIMD version that keeps the data in registers
85  */
B3_ATTRIBUTE_ALIGNED16(class)86 B3_ATTRIBUTE_ALIGNED16(class)
87 b3Vector3
88 {
89 public:
90 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)  // _WIN32 || ARM
91 	union {
92 		b3SimdFloat4 mVec128;
93 		float m_floats[4];
94 		struct
95 		{
96 			float x, y, z, w;
97 		};
98 	};
99 #else
100 	union {
101 		float m_floats[4];
102 		struct
103 		{
104 			float x, y, z, w;
105 		};
106 	};
107 #endif
108 
109 public:
110 	B3_DECLARE_ALIGNED_ALLOCATOR();
111 
112 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)  // _WIN32 || ARM
113 
114 	/*B3_FORCE_INLINE		b3Vector3()
115 	{
116 	}
117 	*/
118 
119 	B3_FORCE_INLINE b3SimdFloat4 get128() const
120 	{
121 		return mVec128;
122 	}
123 	B3_FORCE_INLINE void set128(b3SimdFloat4 v128)
124 	{
125 		mVec128 = v128;
126 	}
127 #endif
128 
129 public:
130 	/**@brief Add a vector to this one
131  * @param The vector to add to this one */
132 	B3_FORCE_INLINE b3Vector3& operator+=(const b3Vector3& v)
133 	{
134 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
135 		mVec128 = _mm_add_ps(mVec128, v.mVec128);
136 #elif defined(B3_USE_NEON)
137 		mVec128 = vaddq_f32(mVec128, v.mVec128);
138 #else
139 		m_floats[0] += v.m_floats[0];
140 		m_floats[1] += v.m_floats[1];
141 		m_floats[2] += v.m_floats[2];
142 #endif
143 		return *this;
144 	}
145 
146 	/**@brief Subtract a vector from this one
147    * @param The vector to subtract */
148 	B3_FORCE_INLINE b3Vector3& operator-=(const b3Vector3& v)
149 	{
150 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
151 		mVec128 = _mm_sub_ps(mVec128, v.mVec128);
152 #elif defined(B3_USE_NEON)
153 		mVec128 = vsubq_f32(mVec128, v.mVec128);
154 #else
155 		m_floats[0] -= v.m_floats[0];
156 		m_floats[1] -= v.m_floats[1];
157 		m_floats[2] -= v.m_floats[2];
158 #endif
159 		return *this;
160 	}
161 
162 	/**@brief Scale the vector
163    * @param s Scale factor */
164 	B3_FORCE_INLINE b3Vector3& operator*=(const b3Scalar& s)
165 	{
166 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
167 		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
168 		vs = b3_pshufd_ps(vs, 0x80);  //	(S S S 0.0)
169 		mVec128 = _mm_mul_ps(mVec128, vs);
170 #elif defined(B3_USE_NEON)
171 		mVec128 = vmulq_n_f32(mVec128, s);
172 #else
173 		m_floats[0] *= s;
174 		m_floats[1] *= s;
175 		m_floats[2] *= s;
176 #endif
177 		return *this;
178 	}
179 
180 	/**@brief Inversely scale the vector
181    * @param s Scale factor to divide by */
182 	B3_FORCE_INLINE b3Vector3& operator/=(const b3Scalar& s)
183 	{
184 		b3FullAssert(s != b3Scalar(0.0));
185 
186 #if 0  //defined(B3_USE_SSE_IN_API)
187 // this code is not faster !
188 		__m128 vs = _mm_load_ss(&s);
189 		vs = _mm_div_ss(b3v1110, vs);
190 		vs = b3_pshufd_ps(vs, 0x00);	//	(S S S S)
191 
192 		mVec128 = _mm_mul_ps(mVec128, vs);
193 
194 		return *this;
195 #else
196 		return *this *= b3Scalar(1.0) / s;
197 #endif
198 	}
199 
200 	/**@brief Return the dot product
201    * @param v The other vector in the dot product */
202 	B3_FORCE_INLINE b3Scalar dot(const b3Vector3& v) const
203 	{
204 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
205 		__m128 vd = _mm_mul_ps(mVec128, v.mVec128);
206 		__m128 z = _mm_movehl_ps(vd, vd);
207 		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
208 		vd = _mm_add_ss(vd, y);
209 		vd = _mm_add_ss(vd, z);
210 		return _mm_cvtss_f32(vd);
211 #elif defined(B3_USE_NEON)
212 		float32x4_t vd = vmulq_f32(mVec128, v.mVec128);
213 		float32x2_t x = vpadd_f32(vget_low_f32(vd), vget_low_f32(vd));
214 		x = vadd_f32(x, vget_high_f32(vd));
215 		return vget_lane_f32(x, 0);
216 #else
217 		return m_floats[0] * v.m_floats[0] +
218 			   m_floats[1] * v.m_floats[1] +
219 			   m_floats[2] * v.m_floats[2];
220 #endif
221 	}
222 
223 	/**@brief Return the length of the vector squared */
224 	B3_FORCE_INLINE b3Scalar length2() const
225 	{
226 		return dot(*this);
227 	}
228 
229 	/**@brief Return the length of the vector */
230 	B3_FORCE_INLINE b3Scalar length() const
231 	{
232 		return b3Sqrt(length2());
233 	}
234 
235 	/**@brief Return the distance squared between the ends of this and another vector
236    * This is symantically treating the vector like a point */
237 	B3_FORCE_INLINE b3Scalar distance2(const b3Vector3& v) const;
238 
239 	/**@brief Return the distance between the ends of this and another vector
240    * This is symantically treating the vector like a point */
241 	B3_FORCE_INLINE b3Scalar distance(const b3Vector3& v) const;
242 
243 	B3_FORCE_INLINE b3Vector3& safeNormalize()
244 	{
245 		b3Scalar l2 = length2();
246 		//triNormal.normalize();
247 		if (l2 >= B3_EPSILON * B3_EPSILON)
248 		{
249 			(*this) /= b3Sqrt(l2);
250 		}
251 		else
252 		{
253 			setValue(1, 0, 0);
254 		}
255 		return *this;
256 	}
257 
258 	/**@brief Normalize this vector
259    * x^2 + y^2 + z^2 = 1 */
260 	B3_FORCE_INLINE b3Vector3& normalize()
261 	{
262 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
263 		// dot product first
264 		__m128 vd = _mm_mul_ps(mVec128, mVec128);
265 		__m128 z = _mm_movehl_ps(vd, vd);
266 		__m128 y = _mm_shuffle_ps(vd, vd, 0x55);
267 		vd = _mm_add_ss(vd, y);
268 		vd = _mm_add_ss(vd, z);
269 
270 #if 0
271         vd = _mm_sqrt_ss(vd);
272 		vd = _mm_div_ss(b3v1110, vd);
273 		vd = b3_splat_ps(vd, 0x80);
274 		mVec128 = _mm_mul_ps(mVec128, vd);
275 #else
276 
277 		// NR step 1/sqrt(x) - vd is x, y is output
278 		y = _mm_rsqrt_ss(vd);  // estimate
279 
280 		//  one step NR
281 		z = b3v1_5;
282 		vd = _mm_mul_ss(vd, b3vHalf);  // vd * 0.5
283 		//x2 = vd;
284 		vd = _mm_mul_ss(vd, y);  // vd * 0.5 * y0
285 		vd = _mm_mul_ss(vd, y);  // vd * 0.5 * y0 * y0
286 		z = _mm_sub_ss(z, vd);   // 1.5 - vd * 0.5 * y0 * y0
287 
288 		y = _mm_mul_ss(y, z);  // y0 * (1.5 - vd * 0.5 * y0 * y0)
289 
290 		y = b3_splat_ps(y, 0x80);
291 		mVec128 = _mm_mul_ps(mVec128, y);
292 
293 #endif
294 
295 		return *this;
296 #else
297 		return *this /= length();
298 #endif
299 	}
300 
301 	/**@brief Return a normalized version of this vector */
302 	B3_FORCE_INLINE b3Vector3 normalized() const;
303 
304 	/**@brief Return a rotated version of this vector
305    * @param wAxis The axis to rotate about
306    * @param angle The angle to rotate by */
307 	B3_FORCE_INLINE b3Vector3 rotate(const b3Vector3& wAxis, const b3Scalar angle) const;
308 
309 	/**@brief Return the angle between this and another vector
310    * @param v The other vector */
311 	B3_FORCE_INLINE b3Scalar angle(const b3Vector3& v) const
312 	{
313 		b3Scalar s = b3Sqrt(length2() * v.length2());
314 		b3FullAssert(s != b3Scalar(0.0));
315 		return b3Acos(dot(v) / s);
316 	}
317 
318 	/**@brief Return a vector will the absolute values of each element */
319 	B3_FORCE_INLINE b3Vector3 absolute() const
320 	{
321 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
322 		return b3MakeVector3(_mm_and_ps(mVec128, b3v3AbsfMask));
323 #elif defined(B3_USE_NEON)
324 		return b3Vector3(vabsq_f32(mVec128));
325 #else
326 		return b3MakeVector3(
327 			b3Fabs(m_floats[0]),
328 			b3Fabs(m_floats[1]),
329 			b3Fabs(m_floats[2]));
330 #endif
331 	}
332 
333 	/**@brief Return the cross product between this and another vector
334    * @param v The other vector */
335 	B3_FORCE_INLINE b3Vector3 cross(const b3Vector3& v) const
336 	{
337 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
338 		__m128 T, V;
339 
340 		T = b3_pshufd_ps(mVec128, B3_SHUFFLE(1, 2, 0, 3));    //	(Y Z X 0)
341 		V = b3_pshufd_ps(v.mVec128, B3_SHUFFLE(1, 2, 0, 3));  //	(Y Z X 0)
342 
343 		V = _mm_mul_ps(V, mVec128);
344 		T = _mm_mul_ps(T, v.mVec128);
345 		V = _mm_sub_ps(V, T);
346 
347 		V = b3_pshufd_ps(V, B3_SHUFFLE(1, 2, 0, 3));
348 		return b3MakeVector3(V);
349 #elif defined(B3_USE_NEON)
350 		float32x4_t T, V;
351 		// form (Y, Z, X, _) of mVec128 and v.mVec128
352 		float32x2_t Tlow = vget_low_f32(mVec128);
353 		float32x2_t Vlow = vget_low_f32(v.mVec128);
354 		T = vcombine_f32(vext_f32(Tlow, vget_high_f32(mVec128), 1), Tlow);
355 		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v.mVec128), 1), Vlow);
356 
357 		V = vmulq_f32(V, mVec128);
358 		T = vmulq_f32(T, v.mVec128);
359 		V = vsubq_f32(V, T);
360 		Vlow = vget_low_f32(V);
361 		// form (Y, Z, X, _);
362 		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
363 		V = (float32x4_t)vandq_s32((int32x4_t)V, b3vFFF0Mask);
364 
365 		return b3Vector3(V);
366 #else
367 		return b3MakeVector3(
368 			m_floats[1] * v.m_floats[2] - m_floats[2] * v.m_floats[1],
369 			m_floats[2] * v.m_floats[0] - m_floats[0] * v.m_floats[2],
370 			m_floats[0] * v.m_floats[1] - m_floats[1] * v.m_floats[0]);
371 #endif
372 	}
373 
374 	B3_FORCE_INLINE b3Scalar triple(const b3Vector3& v1, const b3Vector3& v2) const
375 	{
376 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
377 		// cross:
378 		__m128 T = _mm_shuffle_ps(v1.mVec128, v1.mVec128, B3_SHUFFLE(1, 2, 0, 3));  //	(Y Z X 0)
379 		__m128 V = _mm_shuffle_ps(v2.mVec128, v2.mVec128, B3_SHUFFLE(1, 2, 0, 3));  //	(Y Z X 0)
380 
381 		V = _mm_mul_ps(V, v1.mVec128);
382 		T = _mm_mul_ps(T, v2.mVec128);
383 		V = _mm_sub_ps(V, T);
384 
385 		V = _mm_shuffle_ps(V, V, B3_SHUFFLE(1, 2, 0, 3));
386 
387 		// dot:
388 		V = _mm_mul_ps(V, mVec128);
389 		__m128 z = _mm_movehl_ps(V, V);
390 		__m128 y = _mm_shuffle_ps(V, V, 0x55);
391 		V = _mm_add_ss(V, y);
392 		V = _mm_add_ss(V, z);
393 		return _mm_cvtss_f32(V);
394 
395 #elif defined(B3_USE_NEON)
396 		// cross:
397 		float32x4_t T, V;
398 		// form (Y, Z, X, _) of mVec128 and v.mVec128
399 		float32x2_t Tlow = vget_low_f32(v1.mVec128);
400 		float32x2_t Vlow = vget_low_f32(v2.mVec128);
401 		T = vcombine_f32(vext_f32(Tlow, vget_high_f32(v1.mVec128), 1), Tlow);
402 		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(v2.mVec128), 1), Vlow);
403 
404 		V = vmulq_f32(V, v1.mVec128);
405 		T = vmulq_f32(T, v2.mVec128);
406 		V = vsubq_f32(V, T);
407 		Vlow = vget_low_f32(V);
408 		// form (Y, Z, X, _);
409 		V = vcombine_f32(vext_f32(Vlow, vget_high_f32(V), 1), Vlow);
410 
411 		// dot:
412 		V = vmulq_f32(mVec128, V);
413 		float32x2_t x = vpadd_f32(vget_low_f32(V), vget_low_f32(V));
414 		x = vadd_f32(x, vget_high_f32(V));
415 		return vget_lane_f32(x, 0);
416 #else
417 		return m_floats[0] * (v1.m_floats[1] * v2.m_floats[2] - v1.m_floats[2] * v2.m_floats[1]) +
418 			   m_floats[1] * (v1.m_floats[2] * v2.m_floats[0] - v1.m_floats[0] * v2.m_floats[2]) +
419 			   m_floats[2] * (v1.m_floats[0] * v2.m_floats[1] - v1.m_floats[1] * v2.m_floats[0]);
420 #endif
421 	}
422 
423 	/**@brief Return the axis with the smallest value
424    * Note return values are 0,1,2 for x, y, or z */
425 	B3_FORCE_INLINE int minAxis() const
426 	{
427 		return m_floats[0] < m_floats[1] ? (m_floats[0] < m_floats[2] ? 0 : 2) : (m_floats[1] < m_floats[2] ? 1 : 2);
428 	}
429 
430 	/**@brief Return the axis with the largest value
431    * Note return values are 0,1,2 for x, y, or z */
432 	B3_FORCE_INLINE int maxAxis() const
433 	{
434 		return m_floats[0] < m_floats[1] ? (m_floats[1] < m_floats[2] ? 2 : 1) : (m_floats[0] < m_floats[2] ? 2 : 0);
435 	}
436 
437 	B3_FORCE_INLINE int furthestAxis() const
438 	{
439 		return absolute().minAxis();
440 	}
441 
442 	B3_FORCE_INLINE int closestAxis() const
443 	{
444 		return absolute().maxAxis();
445 	}
446 
447 	B3_FORCE_INLINE void setInterpolate3(const b3Vector3& v0, const b3Vector3& v1, b3Scalar rt)
448 	{
449 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
450 		__m128 vrt = _mm_load_ss(&rt);  //	(rt 0 0 0)
451 		b3Scalar s = b3Scalar(1.0) - rt;
452 		__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
453 		vs = b3_pshufd_ps(vs, 0x80);  //	(S S S 0.0)
454 		__m128 r0 = _mm_mul_ps(v0.mVec128, vs);
455 		vrt = b3_pshufd_ps(vrt, 0x80);  //	(rt rt rt 0.0)
456 		__m128 r1 = _mm_mul_ps(v1.mVec128, vrt);
457 		__m128 tmp3 = _mm_add_ps(r0, r1);
458 		mVec128 = tmp3;
459 #elif defined(B3_USE_NEON)
460 		float32x4_t vl = vsubq_f32(v1.mVec128, v0.mVec128);
461 		vl = vmulq_n_f32(vl, rt);
462 		mVec128 = vaddq_f32(vl, v0.mVec128);
463 #else
464 		b3Scalar s = b3Scalar(1.0) - rt;
465 		m_floats[0] = s * v0.m_floats[0] + rt * v1.m_floats[0];
466 		m_floats[1] = s * v0.m_floats[1] + rt * v1.m_floats[1];
467 		m_floats[2] = s * v0.m_floats[2] + rt * v1.m_floats[2];
468 		//don't do the unused w component
469 		//		m_co[3] = s * v0[3] + rt * v1[3];
470 #endif
471 	}
472 
473 	/**@brief Return the linear interpolation between this and another vector
474    * @param v The other vector
475    * @param t The ration of this to v (t = 0 => return this, t=1 => return other) */
476 	B3_FORCE_INLINE b3Vector3 lerp(const b3Vector3& v, const b3Scalar& t) const
477 	{
478 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
479 		__m128 vt = _mm_load_ss(&t);  //	(t 0 0 0)
480 		vt = b3_pshufd_ps(vt, 0x80);  //	(rt rt rt 0.0)
481 		__m128 vl = _mm_sub_ps(v.mVec128, mVec128);
482 		vl = _mm_mul_ps(vl, vt);
483 		vl = _mm_add_ps(vl, mVec128);
484 
485 		return b3MakeVector3(vl);
486 #elif defined(B3_USE_NEON)
487 		float32x4_t vl = vsubq_f32(v.mVec128, mVec128);
488 		vl = vmulq_n_f32(vl, t);
489 		vl = vaddq_f32(vl, mVec128);
490 
491 		return b3Vector3(vl);
492 #else
493 		return b3MakeVector3(m_floats[0] + (v.m_floats[0] - m_floats[0]) * t,
494 							 m_floats[1] + (v.m_floats[1] - m_floats[1]) * t,
495 							 m_floats[2] + (v.m_floats[2] - m_floats[2]) * t);
496 #endif
497 	}
498 
499 	/**@brief Elementwise multiply this vector by the other
500    * @param v The other vector */
501 	B3_FORCE_INLINE b3Vector3& operator*=(const b3Vector3& v)
502 	{
503 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
504 		mVec128 = _mm_mul_ps(mVec128, v.mVec128);
505 #elif defined(B3_USE_NEON)
506 		mVec128 = vmulq_f32(mVec128, v.mVec128);
507 #else
508 		m_floats[0] *= v.m_floats[0];
509 		m_floats[1] *= v.m_floats[1];
510 		m_floats[2] *= v.m_floats[2];
511 #endif
512 		return *this;
513 	}
514 
515 	/**@brief Return the x value */
516 	B3_FORCE_INLINE const b3Scalar& getX() const { return m_floats[0]; }
517 	/**@brief Return the y value */
518 	B3_FORCE_INLINE const b3Scalar& getY() const { return m_floats[1]; }
519 	/**@brief Return the z value */
520 	B3_FORCE_INLINE const b3Scalar& getZ() const { return m_floats[2]; }
521 	/**@brief Return the w value */
522 	B3_FORCE_INLINE const b3Scalar& getW() const { return m_floats[3]; }
523 
524 	/**@brief Set the x value */
525 	B3_FORCE_INLINE void setX(b3Scalar _x) { m_floats[0] = _x; };
526 	/**@brief Set the y value */
527 	B3_FORCE_INLINE void setY(b3Scalar _y) { m_floats[1] = _y; };
528 	/**@brief Set the z value */
529 	B3_FORCE_INLINE void setZ(b3Scalar _z) { m_floats[2] = _z; };
530 	/**@brief Set the w value */
531 	B3_FORCE_INLINE void setW(b3Scalar _w) { m_floats[3] = _w; };
532 
533 	//B3_FORCE_INLINE b3Scalar&       operator[](int i)       { return (&m_floats[0])[i];	}
534 	//B3_FORCE_INLINE const b3Scalar& operator[](int i) const { return (&m_floats[0])[i]; }
535 	///operator b3Scalar*() replaces operator[], using implicit conversion. We added operator != and operator == to avoid pointer comparisons.
536 	B3_FORCE_INLINE operator b3Scalar*() { return &m_floats[0]; }
537 	B3_FORCE_INLINE operator const b3Scalar*() const { return &m_floats[0]; }
538 
539 	B3_FORCE_INLINE bool operator==(const b3Vector3& other) const
540 	{
541 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
542 		return (0xf == _mm_movemask_ps((__m128)_mm_cmpeq_ps(mVec128, other.mVec128)));
543 #else
544 		return ((m_floats[3] == other.m_floats[3]) &&
545 				(m_floats[2] == other.m_floats[2]) &&
546 				(m_floats[1] == other.m_floats[1]) &&
547 				(m_floats[0] == other.m_floats[0]));
548 #endif
549 	}
550 
551 	B3_FORCE_INLINE bool operator!=(const b3Vector3& other) const
552 	{
553 		return !(*this == other);
554 	}
555 
556 	/**@brief Set each element to the max of the current values and the values of another b3Vector3
557    * @param other The other b3Vector3 to compare with
558    */
559 	B3_FORCE_INLINE void setMax(const b3Vector3& other)
560 	{
561 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
562 		mVec128 = _mm_max_ps(mVec128, other.mVec128);
563 #elif defined(B3_USE_NEON)
564 		mVec128 = vmaxq_f32(mVec128, other.mVec128);
565 #else
566 		b3SetMax(m_floats[0], other.m_floats[0]);
567 		b3SetMax(m_floats[1], other.m_floats[1]);
568 		b3SetMax(m_floats[2], other.m_floats[2]);
569 		b3SetMax(m_floats[3], other.m_floats[3]);
570 #endif
571 	}
572 
573 	/**@brief Set each element to the min of the current values and the values of another b3Vector3
574    * @param other The other b3Vector3 to compare with
575    */
576 	B3_FORCE_INLINE void setMin(const b3Vector3& other)
577 	{
578 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
579 		mVec128 = _mm_min_ps(mVec128, other.mVec128);
580 #elif defined(B3_USE_NEON)
581 		mVec128 = vminq_f32(mVec128, other.mVec128);
582 #else
583 		b3SetMin(m_floats[0], other.m_floats[0]);
584 		b3SetMin(m_floats[1], other.m_floats[1]);
585 		b3SetMin(m_floats[2], other.m_floats[2]);
586 		b3SetMin(m_floats[3], other.m_floats[3]);
587 #endif
588 	}
589 
590 	B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z)
591 	{
592 		m_floats[0] = _x;
593 		m_floats[1] = _y;
594 		m_floats[2] = _z;
595 		m_floats[3] = b3Scalar(0.f);
596 	}
597 
598 	void getSkewSymmetricMatrix(b3Vector3 * v0, b3Vector3 * v1, b3Vector3 * v2) const
599 	{
600 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
601 
602 		__m128 V = _mm_and_ps(mVec128, b3vFFF0fMask);
603 		__m128 V0 = _mm_xor_ps(b3vMzeroMask, V);
604 		__m128 V2 = _mm_movelh_ps(V0, V);
605 
606 		__m128 V1 = _mm_shuffle_ps(V, V0, 0xCE);
607 
608 		V0 = _mm_shuffle_ps(V0, V, 0xDB);
609 		V2 = _mm_shuffle_ps(V2, V, 0xF9);
610 
611 		v0->mVec128 = V0;
612 		v1->mVec128 = V1;
613 		v2->mVec128 = V2;
614 #else
615 		v0->setValue(0., -getZ(), getY());
616 		v1->setValue(getZ(), 0., -getX());
617 		v2->setValue(-getY(), getX(), 0.);
618 #endif
619 	}
620 
621 	void setZero()
622 	{
623 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
624 		mVec128 = (__m128)_mm_xor_ps(mVec128, mVec128);
625 #elif defined(B3_USE_NEON)
626 		int32x4_t vi = vdupq_n_s32(0);
627 		mVec128 = vreinterpretq_f32_s32(vi);
628 #else
629 		setValue(b3Scalar(0.), b3Scalar(0.), b3Scalar(0.));
630 #endif
631 	}
632 
633 	B3_FORCE_INLINE bool isZero() const
634 	{
635 		return m_floats[0] == b3Scalar(0) && m_floats[1] == b3Scalar(0) && m_floats[2] == b3Scalar(0);
636 	}
637 
638 	B3_FORCE_INLINE bool fuzzyZero() const
639 	{
640 		return length2() < B3_EPSILON;
641 	}
642 
643 	B3_FORCE_INLINE void serialize(struct b3Vector3Data & dataOut) const;
644 
645 	B3_FORCE_INLINE void deSerialize(const struct b3Vector3Data& dataIn);
646 
647 	B3_FORCE_INLINE void serializeFloat(struct b3Vector3FloatData & dataOut) const;
648 
649 	B3_FORCE_INLINE void deSerializeFloat(const struct b3Vector3FloatData& dataIn);
650 
651 	B3_FORCE_INLINE void serializeDouble(struct b3Vector3DoubleData & dataOut) const;
652 
653 	B3_FORCE_INLINE void deSerializeDouble(const struct b3Vector3DoubleData& dataIn);
654 
655 	/**@brief returns index of maximum dot product between this and vectors in array[]
656          * @param array The other vectors
657          * @param array_count The number of other vectors
658          * @param dotOut The maximum dot product */
659 	B3_FORCE_INLINE long maxDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const;
660 
661 	/**@brief returns index of minimum dot product between this and vectors in array[]
662          * @param array The other vectors
663          * @param array_count The number of other vectors
664          * @param dotOut The minimum dot product */
665 	B3_FORCE_INLINE long minDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const;
666 
667 	/* create a vector as  b3Vector3( this->dot( b3Vector3 v0 ), this->dot( b3Vector3 v1), this->dot( b3Vector3 v2 ))  */
668 	B3_FORCE_INLINE b3Vector3 dot3(const b3Vector3& v0, const b3Vector3& v1, const b3Vector3& v2) const
669 	{
670 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
671 
672 		__m128 a0 = _mm_mul_ps(v0.mVec128, this->mVec128);
673 		__m128 a1 = _mm_mul_ps(v1.mVec128, this->mVec128);
674 		__m128 a2 = _mm_mul_ps(v2.mVec128, this->mVec128);
675 		__m128 b0 = _mm_unpacklo_ps(a0, a1);
676 		__m128 b1 = _mm_unpackhi_ps(a0, a1);
677 		__m128 b2 = _mm_unpacklo_ps(a2, _mm_setzero_ps());
678 		__m128 r = _mm_movelh_ps(b0, b2);
679 		r = _mm_add_ps(r, _mm_movehl_ps(b2, b0));
680 		a2 = _mm_and_ps(a2, b3vxyzMaskf);
681 		r = _mm_add_ps(r, b3CastdTo128f(_mm_move_sd(b3CastfTo128d(a2), b3CastfTo128d(b1))));
682 		return b3MakeVector3(r);
683 
684 #elif defined(B3_USE_NEON)
685 		static const uint32x4_t xyzMask = (const uint32x4_t){-1, -1, -1, 0};
686 		float32x4_t a0 = vmulq_f32(v0.mVec128, this->mVec128);
687 		float32x4_t a1 = vmulq_f32(v1.mVec128, this->mVec128);
688 		float32x4_t a2 = vmulq_f32(v2.mVec128, this->mVec128);
689 		float32x2x2_t zLo = vtrn_f32(vget_high_f32(a0), vget_high_f32(a1));
690 		a2 = (float32x4_t)vandq_u32((uint32x4_t)a2, xyzMask);
691 		float32x2_t b0 = vadd_f32(vpadd_f32(vget_low_f32(a0), vget_low_f32(a1)), zLo.val[0]);
692 		float32x2_t b1 = vpadd_f32(vpadd_f32(vget_low_f32(a2), vget_high_f32(a2)), vdup_n_f32(0.0f));
693 		return b3Vector3(vcombine_f32(b0, b1));
694 #else
695 		return b3MakeVector3(dot(v0), dot(v1), dot(v2));
696 #endif
697 	}
698 };
699 
700 /**@brief Return the sum of two vectors (Point symantics)*/
701 B3_FORCE_INLINE b3Vector3
702 operator+(const b3Vector3& v1, const b3Vector3& v2)
703 {
704 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
705 	return b3MakeVector3(_mm_add_ps(v1.mVec128, v2.mVec128));
706 #elif defined(B3_USE_NEON)
707 	return b3MakeVector3(vaddq_f32(v1.mVec128, v2.mVec128));
708 #else
709 	return b3MakeVector3(
710 		v1.m_floats[0] + v2.m_floats[0],
711 		v1.m_floats[1] + v2.m_floats[1],
712 		v1.m_floats[2] + v2.m_floats[2]);
713 #endif
714 }
715 
716 /**@brief Return the elementwise product of two vectors */
717 B3_FORCE_INLINE b3Vector3
718 operator*(const b3Vector3& v1, const b3Vector3& v2)
719 {
720 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
721 	return b3MakeVector3(_mm_mul_ps(v1.mVec128, v2.mVec128));
722 #elif defined(B3_USE_NEON)
723 	return b3MakeVector3(vmulq_f32(v1.mVec128, v2.mVec128));
724 #else
725 	return b3MakeVector3(
726 		v1.m_floats[0] * v2.m_floats[0],
727 		v1.m_floats[1] * v2.m_floats[1],
728 		v1.m_floats[2] * v2.m_floats[2]);
729 #endif
730 }
731 
732 /**@brief Return the difference between two vectors */
733 B3_FORCE_INLINE b3Vector3
734 operator-(const b3Vector3& v1, const b3Vector3& v2)
735 {
736 #if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
737 
738 	//	without _mm_and_ps this code causes slowdown in Concave moving
739 	__m128 r = _mm_sub_ps(v1.mVec128, v2.mVec128);
740 	return b3MakeVector3(_mm_and_ps(r, b3vFFF0fMask));
741 #elif defined(B3_USE_NEON)
742 	float32x4_t r = vsubq_f32(v1.mVec128, v2.mVec128);
743 	return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)r, b3vFFF0Mask));
744 #else
745 	return b3MakeVector3(
746 		v1.m_floats[0] - v2.m_floats[0],
747 		v1.m_floats[1] - v2.m_floats[1],
748 		v1.m_floats[2] - v2.m_floats[2]);
749 #endif
750 }
751 
752 /**@brief Return the negative of the vector */
753 B3_FORCE_INLINE b3Vector3
754 operator-(const b3Vector3& v)
755 {
756 #if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
757 	__m128 r = _mm_xor_ps(v.mVec128, b3vMzeroMask);
758 	return b3MakeVector3(_mm_and_ps(r, b3vFFF0fMask));
759 #elif defined(B3_USE_NEON)
760 	return b3MakeVector3((b3SimdFloat4)veorq_s32((int32x4_t)v.mVec128, (int32x4_t)b3vMzeroMask));
761 #else
762 	return b3MakeVector3(-v.m_floats[0], -v.m_floats[1], -v.m_floats[2]);
763 #endif
764 }
765 
766 /**@brief Return the vector scaled by s */
767 B3_FORCE_INLINE b3Vector3
768 operator*(const b3Vector3& v, const b3Scalar& s)
769 {
770 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
771 	__m128 vs = _mm_load_ss(&s);  //	(S 0 0 0)
772 	vs = b3_pshufd_ps(vs, 0x80);  //	(S S S 0.0)
773 	return b3MakeVector3(_mm_mul_ps(v.mVec128, vs));
774 #elif defined(B3_USE_NEON)
775 	float32x4_t r = vmulq_n_f32(v.mVec128, s);
776 	return b3MakeVector3((float32x4_t)vandq_s32((int32x4_t)r, b3vFFF0Mask));
777 #else
778 	return b3MakeVector3(v.m_floats[0] * s, v.m_floats[1] * s, v.m_floats[2] * s);
779 #endif
780 }
781 
782 /**@brief Return the vector scaled by s */
783 B3_FORCE_INLINE b3Vector3
784 operator*(const b3Scalar& s, const b3Vector3& v)
785 {
786 	return v * s;
787 }
788 
789 /**@brief Return the vector inversely scaled by s */
790 B3_FORCE_INLINE b3Vector3
791 operator/(const b3Vector3& v, const b3Scalar& s)
792 {
793 	b3FullAssert(s != b3Scalar(0.0));
794 #if 0  //defined(B3_USE_SSE_IN_API)
795 // this code is not faster !
796 	__m128 vs = _mm_load_ss(&s);
797     vs = _mm_div_ss(b3v1110, vs);
798 	vs = b3_pshufd_ps(vs, 0x00);	//	(S S S S)
799 
800 	return b3Vector3(_mm_mul_ps(v.mVec128, vs));
801 #else
802 	return v * (b3Scalar(1.0) / s);
803 #endif
804 }
805 
806 /**@brief Return the vector inversely scaled by s */
807 B3_FORCE_INLINE b3Vector3
808 operator/(const b3Vector3& v1, const b3Vector3& v2)
809 {
810 #if (defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE))
811 	__m128 vec = _mm_div_ps(v1.mVec128, v2.mVec128);
812 	vec = _mm_and_ps(vec, b3vFFF0fMask);
813 	return b3MakeVector3(vec);
814 #elif defined(B3_USE_NEON)
815 	float32x4_t x, y, v, m;
816 
817 	x = v1.mVec128;
818 	y = v2.mVec128;
819 
820 	v = vrecpeq_f32(y);     // v ~ 1/y
821 	m = vrecpsq_f32(y, v);  // m = (2-v*y)
822 	v = vmulq_f32(v, m);    // vv = v*m ~~ 1/y
823 	m = vrecpsq_f32(y, v);  // mm = (2-vv*y)
824 	v = vmulq_f32(v, x);    // x*vv
825 	v = vmulq_f32(v, m);    // (x*vv)*(2-vv*y) = x*(vv(2-vv*y)) ~~~ x/y
826 
827 	return b3Vector3(v);
828 #else
829 	return b3MakeVector3(
830 		v1.m_floats[0] / v2.m_floats[0],
831 		v1.m_floats[1] / v2.m_floats[1],
832 		v1.m_floats[2] / v2.m_floats[2]);
833 #endif
834 }
835 
836 /**@brief Return the dot product between two vectors */
837 B3_FORCE_INLINE b3Scalar
b3Dot(const b3Vector3 & v1,const b3Vector3 & v2)838 b3Dot(const b3Vector3& v1, const b3Vector3& v2)
839 {
840 	return v1.dot(v2);
841 }
842 
843 /**@brief Return the distance squared between two vectors */
844 B3_FORCE_INLINE b3Scalar
b3Distance2(const b3Vector3 & v1,const b3Vector3 & v2)845 b3Distance2(const b3Vector3& v1, const b3Vector3& v2)
846 {
847 	return v1.distance2(v2);
848 }
849 
850 /**@brief Return the distance between two vectors */
851 B3_FORCE_INLINE b3Scalar
b3Distance(const b3Vector3 & v1,const b3Vector3 & v2)852 b3Distance(const b3Vector3& v1, const b3Vector3& v2)
853 {
854 	return v1.distance(v2);
855 }
856 
857 /**@brief Return the angle between two vectors */
858 B3_FORCE_INLINE b3Scalar
b3Angle(const b3Vector3 & v1,const b3Vector3 & v2)859 b3Angle(const b3Vector3& v1, const b3Vector3& v2)
860 {
861 	return v1.angle(v2);
862 }
863 
864 /**@brief Return the cross product of two vectors */
865 B3_FORCE_INLINE b3Vector3
b3Cross(const b3Vector3 & v1,const b3Vector3 & v2)866 b3Cross(const b3Vector3& v1, const b3Vector3& v2)
867 {
868 	return v1.cross(v2);
869 }
870 
871 B3_FORCE_INLINE b3Scalar
b3Triple(const b3Vector3 & v1,const b3Vector3 & v2,const b3Vector3 & v3)872 b3Triple(const b3Vector3& v1, const b3Vector3& v2, const b3Vector3& v3)
873 {
874 	return v1.triple(v2, v3);
875 }
876 
877 /**@brief Return the linear interpolation between two vectors
878  * @param v1 One vector
879  * @param v2 The other vector
880  * @param t The ration of this to v (t = 0 => return v1, t=1 => return v2) */
881 B3_FORCE_INLINE b3Vector3
b3Lerp(const b3Vector3 & v1,const b3Vector3 & v2,const b3Scalar & t)882 b3Lerp(const b3Vector3& v1, const b3Vector3& v2, const b3Scalar& t)
883 {
884 	return v1.lerp(v2, t);
885 }
886 
distance2(const b3Vector3 & v)887 B3_FORCE_INLINE b3Scalar b3Vector3::distance2(const b3Vector3& v) const
888 {
889 	return (v - *this).length2();
890 }
891 
distance(const b3Vector3 & v)892 B3_FORCE_INLINE b3Scalar b3Vector3::distance(const b3Vector3& v) const
893 {
894 	return (v - *this).length();
895 }
896 
normalized()897 B3_FORCE_INLINE b3Vector3 b3Vector3::normalized() const
898 {
899 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
900 	b3Vector3 norm = *this;
901 
902 	return norm.normalize();
903 #else
904 	return *this / length();
905 #endif
906 }
907 
/**@brief Return this vector rotated about wAxis by _angle
 * @param wAxis Rotation axis; must be a unit length vector (not checked here)
 * @param _angle Rotation angle, passed directly to b3Sin / b3Cos
 *
 * Both code paths build the result as
 *   o + (this - o) * cos(_angle) + (wAxis x this) * sin(_angle)
 * where o = wAxis * (wAxis . this). */
B3_FORCE_INLINE b3Vector3 b3Vector3::rotate(const b3Vector3& wAxis, const b3Scalar _angle) const
{
	// wAxis must be a unit length vector

#if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)

	// O starts as the component-wise product wAxis * this; it is reduced to a dot product below.
	__m128 O = _mm_mul_ps(wAxis.mVec128, mVec128);
	b3Scalar ssin = b3Sin(_angle);
	// C = wAxis x this
	__m128 C = wAxis.cross(b3MakeVector3(mVec128)).mVec128;
	// NOTE(review): b3vFFF0fMask presumably clears the w lane - confirm against its definition.
	O = _mm_and_ps(O, b3vFFF0fMask);
	b3Scalar scos = b3Cos(_angle);

	__m128 vsin = _mm_load_ss(&ssin);  //	(S 0 0 0)
	__m128 vcos = _mm_load_ss(&scos);  //	(C 0 0 0)

	// Adding the two rotated copies of O leaves x+y+z (the dot product) in each of the first three lanes.
	__m128 Y = b3_pshufd_ps(O, 0xC9);  //	(Y Z X 0)
	__m128 Z = b3_pshufd_ps(O, 0xD2);  //	(Z X Y 0)
	O = _mm_add_ps(O, Y);
	vsin = b3_pshufd_ps(vsin, 0x80);  //	(S S S 0)
	O = _mm_add_ps(O, Z);
	vcos = b3_pshufd_ps(vcos, 0x80);  //	(C C C 0)

	vsin = vsin * C;           // sin * (wAxis x this)
	O = O * wAxis.mVec128;     // wAxis * (wAxis . this)
	__m128 X = mVec128 - O;    // this - O

	O = O + vsin;
	vcos = vcos * X;           // cos * (this - O)
	O = O + vcos;

	return b3MakeVector3(O);
#else
	// o: wAxis scaled by (wAxis . this)
	b3Vector3 o = wAxis * wAxis.dot(*this);
	// _x: what remains of this vector after removing o
	b3Vector3 _x = *this - o;
	b3Vector3 _y;

	// _y: wAxis x this
	_y = wAxis.cross(*this);

	return (o + _x * b3Cos(_angle) + _y * b3Sin(_angle));
#endif
}
949 
maxDot(const b3Vector3 * array,long array_count,b3Scalar & dotOut)950 B3_FORCE_INLINE long b3Vector3::maxDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const
951 {
952 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)
953 #if defined _WIN32 || defined(B3_USE_SSE)
954 	const long scalar_cutoff = 10;
955 	long b3_maxdot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
956 #elif defined B3_USE_NEON
957 	const long scalar_cutoff = 4;
958 	extern long (*_maxdot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
959 #endif
960 	if (array_count < scalar_cutoff)
961 #else
962 
963 #endif  //B3_USE_SSE || B3_USE_NEON
964 	{
965 		b3Scalar maxDot = -B3_INFINITY;
966 		int i = 0;
967 		int ptIndex = -1;
968 		for (i = 0; i < array_count; i++)
969 		{
970 			b3Scalar dot = array[i].dot(*this);
971 
972 			if (dot > maxDot)
973 			{
974 				maxDot = dot;
975 				ptIndex = i;
976 			}
977 		}
978 
979 		b3Assert(ptIndex >= 0);
980 		if (ptIndex < 0)
981 		{
982 			ptIndex = 0;
983 		}
984 		dotOut = maxDot;
985 		return ptIndex;
986 	}
987 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)
988 	return b3_maxdot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
989 #endif
990 }
991 
minDot(const b3Vector3 * array,long array_count,b3Scalar & dotOut)992 B3_FORCE_INLINE long b3Vector3::minDot(const b3Vector3* array, long array_count, b3Scalar& dotOut) const
993 {
994 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)
995 #if defined B3_USE_SSE
996 	const long scalar_cutoff = 10;
997 	long b3_mindot_large(const float* array, const float* vec, unsigned long array_count, float* dotOut);
998 #elif defined B3_USE_NEON
999 	const long scalar_cutoff = 4;
1000 	extern long (*b3_mindot_large)(const float* array, const float* vec, unsigned long array_count, float* dotOut);
1001 #else
1002 #error unhandled arch!
1003 #endif
1004 
1005 	if (array_count < scalar_cutoff)
1006 #endif  //B3_USE_SSE || B3_USE_NEON
1007 	{
1008 		b3Scalar minDot = B3_INFINITY;
1009 		int i = 0;
1010 		int ptIndex = -1;
1011 
1012 		for (i = 0; i < array_count; i++)
1013 		{
1014 			b3Scalar dot = array[i].dot(*this);
1015 
1016 			if (dot < minDot)
1017 			{
1018 				minDot = dot;
1019 				ptIndex = i;
1020 			}
1021 		}
1022 
1023 		dotOut = minDot;
1024 
1025 		return ptIndex;
1026 	}
1027 #if defined(B3_USE_SSE) || defined(B3_USE_NEON)
1028 	return b3_mindot_large((float*)array, (float*)&m_floats[0], array_count, &dotOut);
1029 #endif
1030 }
1031 
1032 class b3Vector4 : public b3Vector3
1033 {
1034 public:
absolute4()1035 	B3_FORCE_INLINE b3Vector4 absolute4() const
1036 	{
1037 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
1038 		return b3MakeVector4(_mm_and_ps(mVec128, b3vAbsfMask));
1039 #elif defined(B3_USE_NEON)
1040 		return b3Vector4(vabsq_f32(mVec128));
1041 #else
1042 		return b3MakeVector4(
1043 			b3Fabs(m_floats[0]),
1044 			b3Fabs(m_floats[1]),
1045 			b3Fabs(m_floats[2]),
1046 			b3Fabs(m_floats[3]));
1047 #endif
1048 	}
1049 
getW()1050 	b3Scalar getW() const { return m_floats[3]; }
1051 
maxAxis4()1052 	B3_FORCE_INLINE int maxAxis4() const
1053 	{
1054 		int maxIndex = -1;
1055 		b3Scalar maxVal = b3Scalar(-B3_LARGE_FLOAT);
1056 		if (m_floats[0] > maxVal)
1057 		{
1058 			maxIndex = 0;
1059 			maxVal = m_floats[0];
1060 		}
1061 		if (m_floats[1] > maxVal)
1062 		{
1063 			maxIndex = 1;
1064 			maxVal = m_floats[1];
1065 		}
1066 		if (m_floats[2] > maxVal)
1067 		{
1068 			maxIndex = 2;
1069 			maxVal = m_floats[2];
1070 		}
1071 		if (m_floats[3] > maxVal)
1072 		{
1073 			maxIndex = 3;
1074 		}
1075 
1076 		return maxIndex;
1077 	}
1078 
minAxis4()1079 	B3_FORCE_INLINE int minAxis4() const
1080 	{
1081 		int minIndex = -1;
1082 		b3Scalar minVal = b3Scalar(B3_LARGE_FLOAT);
1083 		if (m_floats[0] < minVal)
1084 		{
1085 			minIndex = 0;
1086 			minVal = m_floats[0];
1087 		}
1088 		if (m_floats[1] < minVal)
1089 		{
1090 			minIndex = 1;
1091 			minVal = m_floats[1];
1092 		}
1093 		if (m_floats[2] < minVal)
1094 		{
1095 			minIndex = 2;
1096 			minVal = m_floats[2];
1097 		}
1098 		if (m_floats[3] < minVal)
1099 		{
1100 			minIndex = 3;
1101 			minVal = m_floats[3];
1102 		}
1103 
1104 		return minIndex;
1105 	}
1106 
closestAxis4()1107 	B3_FORCE_INLINE int closestAxis4() const
1108 	{
1109 		return absolute4().maxAxis4();
1110 	}
1111 
1112 	/**@brief Set x,y,z and zero w
1113    * @param x Value of x
1114    * @param y Value of y
1115    * @param z Value of z
1116    */
1117 
1118 	/*		void getValue(b3Scalar *m) const
1119 		{
1120 			m[0] = m_floats[0];
1121 			m[1] = m_floats[1];
1122 			m[2] =m_floats[2];
1123 		}
1124 */
1125 	/**@brief Set the values
1126    * @param x Value of x
1127    * @param y Value of y
1128    * @param z Value of z
1129    * @param w Value of w
1130    */
setValue(const b3Scalar & _x,const b3Scalar & _y,const b3Scalar & _z,const b3Scalar & _w)1131 	B3_FORCE_INLINE void setValue(const b3Scalar& _x, const b3Scalar& _y, const b3Scalar& _z, const b3Scalar& _w)
1132 	{
1133 		m_floats[0] = _x;
1134 		m_floats[1] = _y;
1135 		m_floats[2] = _z;
1136 		m_floats[3] = _w;
1137 	}
1138 };
1139 
1140 ///b3SwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
b3SwapScalarEndian(const b3Scalar & sourceVal,b3Scalar & destVal)1141 B3_FORCE_INLINE void b3SwapScalarEndian(const b3Scalar& sourceVal, b3Scalar& destVal)
1142 {
1143 #ifdef B3_USE_DOUBLE_PRECISION
1144 	unsigned char* dest = (unsigned char*)&destVal;
1145 	unsigned char* src = (unsigned char*)&sourceVal;
1146 	dest[0] = src[7];
1147 	dest[1] = src[6];
1148 	dest[2] = src[5];
1149 	dest[3] = src[4];
1150 	dest[4] = src[3];
1151 	dest[5] = src[2];
1152 	dest[6] = src[1];
1153 	dest[7] = src[0];
1154 #else
1155 	unsigned char* dest = (unsigned char*)&destVal;
1156 	unsigned char* src = (unsigned char*)&sourceVal;
1157 	dest[0] = src[3];
1158 	dest[1] = src[2];
1159 	dest[2] = src[1];
1160 	dest[3] = src[0];
1161 #endif  //B3_USE_DOUBLE_PRECISION
1162 }
1163 ///b3SwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
b3SwapVector3Endian(const b3Vector3 & sourceVec,b3Vector3 & destVec)1164 B3_FORCE_INLINE void b3SwapVector3Endian(const b3Vector3& sourceVec, b3Vector3& destVec)
1165 {
1166 	for (int i = 0; i < 4; i++)
1167 	{
1168 		b3SwapScalarEndian(sourceVec[i], destVec[i]);
1169 	}
1170 }
1171 
1172 ///b3UnSwapVector3Endian swaps vector endianness, useful for network and cross-platform serialization
b3UnSwapVector3Endian(b3Vector3 & vector)1173 B3_FORCE_INLINE void b3UnSwapVector3Endian(b3Vector3& vector)
1174 {
1175 	b3Vector3 swappedVec;
1176 	for (int i = 0; i < 4; i++)
1177 	{
1178 		b3SwapScalarEndian(vector[i], swappedVec[i]);
1179 	}
1180 	vector = swappedVec;
1181 }
1182 
1183 template <class T>
b3PlaneSpace1(const T & n,T & p,T & q)1184 B3_FORCE_INLINE void b3PlaneSpace1(const T& n, T& p, T& q)
1185 {
1186 	if (b3Fabs(n[2]) > B3_SQRT12)
1187 	{
1188 		// choose p in y-z plane
1189 		b3Scalar a = n[1] * n[1] + n[2] * n[2];
1190 		b3Scalar k = b3RecipSqrt(a);
1191 		p[0] = 0;
1192 		p[1] = -n[2] * k;
1193 		p[2] = n[1] * k;
1194 		// set q = n x p
1195 		q[0] = a * k;
1196 		q[1] = -n[0] * p[2];
1197 		q[2] = n[0] * p[1];
1198 	}
1199 	else
1200 	{
1201 		// choose p in x-y plane
1202 		b3Scalar a = n[0] * n[0] + n[1] * n[1];
1203 		b3Scalar k = b3RecipSqrt(a);
1204 		p[0] = -n[1] * k;
1205 		p[1] = n[0] * k;
1206 		p[2] = 0;
1207 		// set q = n x p
1208 		q[0] = -n[2] * p[1];
1209 		q[1] = n[2] * p[0];
1210 		q[2] = a * k;
1211 	}
1212 }
1213 
/// Single-precision storage record for a vector.
/// Written by b3Vector3::serializeFloat and read by b3Vector3::deSerializeFloat.
struct b3Vector3FloatData
{
	float m_floats[4];  ///< all four slots are copied by the (de)serialize functions
};
1218 
/// Double-precision storage record for a vector.
/// Written by b3Vector3::serializeDouble and read by b3Vector3::deSerializeDouble.
struct b3Vector3DoubleData
{
	double m_floats[4];  ///< all four slots are copied by the (de)serialize functions
};
1223 
serializeFloat(struct b3Vector3FloatData & dataOut)1224 B3_FORCE_INLINE void b3Vector3::serializeFloat(struct b3Vector3FloatData& dataOut) const
1225 {
1226 	///could also do a memcpy, check if it is worth it
1227 	for (int i = 0; i < 4; i++)
1228 		dataOut.m_floats[i] = float(m_floats[i]);
1229 }
1230 
deSerializeFloat(const struct b3Vector3FloatData & dataIn)1231 B3_FORCE_INLINE void b3Vector3::deSerializeFloat(const struct b3Vector3FloatData& dataIn)
1232 {
1233 	for (int i = 0; i < 4; i++)
1234 		m_floats[i] = b3Scalar(dataIn.m_floats[i]);
1235 }
1236 
serializeDouble(struct b3Vector3DoubleData & dataOut)1237 B3_FORCE_INLINE void b3Vector3::serializeDouble(struct b3Vector3DoubleData& dataOut) const
1238 {
1239 	///could also do a memcpy, check if it is worth it
1240 	for (int i = 0; i < 4; i++)
1241 		dataOut.m_floats[i] = double(m_floats[i]);
1242 }
1243 
deSerializeDouble(const struct b3Vector3DoubleData & dataIn)1244 B3_FORCE_INLINE void b3Vector3::deSerializeDouble(const struct b3Vector3DoubleData& dataIn)
1245 {
1246 	for (int i = 0; i < 4; i++)
1247 		m_floats[i] = b3Scalar(dataIn.m_floats[i]);
1248 }
1249 
serialize(struct b3Vector3Data & dataOut)1250 B3_FORCE_INLINE void b3Vector3::serialize(struct b3Vector3Data& dataOut) const
1251 {
1252 	///could also do a memcpy, check if it is worth it
1253 	for (int i = 0; i < 4; i++)
1254 		dataOut.m_floats[i] = m_floats[i];
1255 }
1256 
deSerialize(const struct b3Vector3Data & dataIn)1257 B3_FORCE_INLINE void b3Vector3::deSerialize(const struct b3Vector3Data& dataIn)
1258 {
1259 	for (int i = 0; i < 4; i++)
1260 		m_floats[i] = dataIn.m_floats[i];
1261 }
1262 
b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z)1263 inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z)
1264 {
1265 	b3Vector3 tmp;
1266 	tmp.setValue(x, y, z);
1267 	return tmp;
1268 }
1269 
b3MakeVector3(b3Scalar x,b3Scalar y,b3Scalar z,b3Scalar w)1270 inline b3Vector3 b3MakeVector3(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w)
1271 {
1272 	b3Vector3 tmp;
1273 	tmp.setValue(x, y, z);
1274 	tmp.w = w;
1275 	return tmp;
1276 }
1277 
b3MakeVector4(b3Scalar x,b3Scalar y,b3Scalar z,b3Scalar w)1278 inline b3Vector4 b3MakeVector4(b3Scalar x, b3Scalar y, b3Scalar z, b3Scalar w)
1279 {
1280 	b3Vector4 tmp;
1281 	tmp.setValue(x, y, z, w);
1282 	return tmp;
1283 }
1284 
1285 #if defined(B3_USE_SSE_IN_API) && defined(B3_USE_SSE)
1286 
b3MakeVector3(b3SimdFloat4 v)1287 inline b3Vector3 b3MakeVector3(b3SimdFloat4 v)
1288 {
1289 	b3Vector3 tmp;
1290 	tmp.set128(v);
1291 	return tmp;
1292 }
1293 
b3MakeVector4(b3SimdFloat4 vec)1294 inline b3Vector4 b3MakeVector4(b3SimdFloat4 vec)
1295 {
1296 	b3Vector4 tmp;
1297 	tmp.set128(vec);
1298 	return tmp;
1299 }
1300 
1301 #endif
1302 
1303 #endif  //B3_VECTOR3_H
1304