1 // Copyright 2009-2021 Intel Corporation 2 // SPDX-License-Identifier: Apache-2.0 3 4 #pragma once 5 6 #include "../sys/alloc.h" 7 #include "math.h" 8 #include "../simd/sse.h" 9 10 namespace embree 11 { 12 //////////////////////////////////////////////////////////////////////////////// 13 /// SSE Vec3fa Type 14 //////////////////////////////////////////////////////////////////////////////// 15 16 struct __aligned(16) Vec3fa 17 { 18 ALIGNED_STRUCT_(16); 19 20 typedef float Scalar; 21 enum { N = 3 }; 22 union { 23 __m128 m128; 24 struct { float x,y,z; }; 25 }; 26 27 //////////////////////////////////////////////////////////////////////////////// 28 /// Constructors, Assignment & Cast Operators 29 //////////////////////////////////////////////////////////////////////////////// 30 Vec3faVec3fa31 __forceinline Vec3fa( ) {} Vec3faVec3fa32 __forceinline Vec3fa( const __m128 a ) : m128(a) {} 33 Vec3faVec3fa34 __forceinline Vec3fa ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } 35 //__forceinline Vec3fa& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } 36 Vec3faVec3fa37 __forceinline Vec3fa ( const Vec3fa& other ) { m128 = other.m128; } 38 __forceinline Vec3fa& operator =( const Vec3fa& other ) { m128 = other.m128; return *this; } 39 Vec3faVec3fa40 __forceinline explicit Vec3fa( const float a ) : m128(_mm_set1_ps(a)) {} Vec3faVec3fa41 __forceinline Vec3fa( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} 42 Vec3faVec3fa43 __forceinline explicit Vec3fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} 44 vfloat4Vec3fa45 __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } vint4Vec3fa46 __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } Vec2faVec3fa47 __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } Vec3iaVec3fa48 __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } 49 50 //__forceinline operator const __m128&() const { return m128; } 51 //__forceinline operator __m128&() { return m128; } 52 53 //////////////////////////////////////////////////////////////////////////////// 54 /// Loads and Stores 55 //////////////////////////////////////////////////////////////////////////////// 56 loadVec3fa57 static __forceinline Vec3fa load( const void* const a ) { 58 return Vec3fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); 59 } 60 loaduVec3fa61 static __forceinline Vec3fa loadu( const void* const a ) { 62 return Vec3fa(_mm_loadu_ps((float*)a)); 63 } 64 storeuVec3fa65 static __forceinline void storeu ( void* ptr, const Vec3fa& v ) { 66 _mm_storeu_ps((float*)ptr,v.m128); 67 } 68 69 //////////////////////////////////////////////////////////////////////////////// 70 /// Constants 71 //////////////////////////////////////////////////////////////////////////////// 72 Vec3faVec3fa73 __forceinline Vec3fa( ZeroTy ) : m128(_mm_setzero_ps()) {} Vec3faVec3fa74 __forceinline Vec3fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} Vec3faVec3fa75 __forceinline Vec3fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} Vec3faVec3fa76 __forceinline Vec3fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} 77 78 //////////////////////////////////////////////////////////////////////////////// 79 /// Array Access 80 //////////////////////////////////////////////////////////////////////////////// 81 82 __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } 83 __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } 84 }; 85 86 //////////////////////////////////////////////////////////////////////////////// 87 /// Unary Operators 88 //////////////////////////////////////////////////////////////////////////////// 89 90 __forceinline Vec3fa operator +( const Vec3fa& a ) { return a; } 91 __forceinline Vec3fa operator -( const Vec3fa& a ) { 92 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); 93 return _mm_xor_ps(a.m128, mask); 94 } abs(const Vec3fa & a)95 __forceinline Vec3fa abs ( const Vec3fa& a ) { 96 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); 97 return _mm_and_ps(a.m128, mask); 98 } sign(const Vec3fa & a)99 __forceinline Vec3fa sign ( const Vec3fa& a ) { 100 return blendv_ps(Vec3fa(one).m128, (-Vec3fa(one)).m128, _mm_cmplt_ps (a.m128,Vec3fa(zero).m128)); 101 } 102 rcp(const Vec3fa & a)103 __forceinline Vec3fa rcp ( const Vec3fa& a ) 104 { 105 #if defined(__AVX512VL__) 106 const Vec3fa r = _mm_rcp14_ps(a.m128); 107 #else 108 const Vec3fa r = _mm_rcp_ps(a.m128); 109 #endif 110 111 #if defined(__AVX2__) 112 const Vec3fa h_n = _mm_fnmadd_ps(a.m128, r.m128, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0) 113 const Vec3fa res = _mm_fmadd_ps(r.m128, h_n.m128, r.m128); // Then compute r + r * h_n 114 #else 115 const Vec3fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a.m128, r.m128)); // First, compute 1 - a * r (which will be very close to 0) 116 const Vec3fa res = _mm_add_ps(r.m128,_mm_mul_ps(r.m128, h_n.m128)); // Then compute r + r * h_n 117 #endif 118 119 return res; 120 } 121 sqrt(const Vec3fa & a)122 __forceinline Vec3fa sqrt ( const Vec3fa& a ) { return _mm_sqrt_ps(a.m128); } sqr(const Vec3fa & a)123 __forceinline Vec3fa sqr ( const Vec3fa& a ) { return _mm_mul_ps(a.m128,a.m128); } 124 rsqrt(const Vec3fa & a)125 __forceinline Vec3fa rsqrt( const Vec3fa& a ) 126 { 127 #if defined(__AVX512VL__) 128 __m128 r = _mm_rsqrt14_ps(a.m128); 129 #else 130 __m128 r = _mm_rsqrt_ps(a.m128); 131 #endif 132 return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); 133 } 134 zero_fix(const Vec3fa & a)135 __forceinline Vec3fa zero_fix(const Vec3fa& a) { 136 return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); 137 } rcp_safe(const Vec3fa & a)138 __forceinline Vec3fa rcp_safe(const Vec3fa& a) { 139 return rcp(zero_fix(a)); 140 } log(const Vec3fa & a)141 __forceinline Vec3fa log ( const Vec3fa& a ) { 142 return Vec3fa(logf(a.x),logf(a.y),logf(a.z)); 143 } 144 exp(const Vec3fa & a)145 __forceinline Vec3fa exp ( const Vec3fa& a ) { 146 return Vec3fa(expf(a.x),expf(a.y),expf(a.z)); 147 } 148 149 //////////////////////////////////////////////////////////////////////////////// 150 /// Binary Operators 151 //////////////////////////////////////////////////////////////////////////////// 152 153 __forceinline Vec3fa operator +( const Vec3fa& a, const Vec3fa& b ) { return _mm_add_ps(a.m128, b.m128); } 154 __forceinline Vec3fa operator -( const Vec3fa& a, const Vec3fa& b ) { return _mm_sub_ps(a.m128, b.m128); } 155 __forceinline Vec3fa operator *( const Vec3fa& a, const Vec3fa& b ) { return _mm_mul_ps(a.m128, b.m128); } 156 __forceinline Vec3fa operator *( const Vec3fa& a, const float b ) { return a * Vec3fa(b); } 157 __forceinline Vec3fa operator *( const float a, const Vec3fa& b ) { return Vec3fa(a) * b; } 158 __forceinline Vec3fa operator /( const Vec3fa& a, const Vec3fa& b ) { return _mm_div_ps(a.m128,b.m128); } 159 __forceinline Vec3fa operator /( const Vec3fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } 160 __forceinline Vec3fa operator /( const float a, const Vec3fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } 161 min(const Vec3fa & a,const Vec3fa & b)162 __forceinline Vec3fa min( const Vec3fa& a, const Vec3fa& b ) { return _mm_min_ps(a.m128,b.m128); } max(const Vec3fa & a,const Vec3fa & b)163 __forceinline Vec3fa max( const Vec3fa& a, const Vec3fa& b ) { return _mm_max_ps(a.m128,b.m128); } 164 165 #if defined(__SSE4_1__) mini(const Vec3fa & a,const Vec3fa & b)166 __forceinline Vec3fa mini(const Vec3fa& a, const Vec3fa& b) { 167 const vint4 ai = _mm_castps_si128(a.m128); 168 const vint4 bi = _mm_castps_si128(b.m128); 169 const vint4 ci = _mm_min_epi32(ai,bi); 170 return _mm_castsi128_ps(ci); 171 } 172 #endif 173 174 #if defined(__SSE4_1__) maxi(const Vec3fa & a,const Vec3fa & b)175 __forceinline Vec3fa maxi(const Vec3fa& a, const Vec3fa& b) { 176 const vint4 ai = _mm_castps_si128(a.m128); 177 const vint4 bi = _mm_castps_si128(b.m128); 178 const vint4 ci = _mm_max_epi32(ai,bi); 179 return _mm_castsi128_ps(ci); 180 } 181 #endif 182 pow(const Vec3fa & a,const float & b)183 __forceinline Vec3fa pow ( const Vec3fa& a, const float& b ) { 184 return Vec3fa(powf(a.x,b),powf(a.y,b),powf(a.z,b)); 185 } 186 187 //////////////////////////////////////////////////////////////////////////////// 188 /// Ternary Operators 189 //////////////////////////////////////////////////////////////////////////////// 190 191 #if defined(__AVX2__) madd(const Vec3fa & a,const Vec3fa & b,const Vec3fa & c)192 __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } msub(const Vec3fa & a,const Vec3fa & b,const Vec3fa & c)193 __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } nmadd(const Vec3fa & a,const Vec3fa & b,const Vec3fa & c)194 __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } nmsub(const Vec3fa & a,const Vec3fa & b,const Vec3fa & c)195 __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } 196 #else madd(const Vec3fa & a,const Vec3fa & b,const Vec3fa & c)197 __forceinline Vec3fa madd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b+c; } msub(const Vec3fa & a,const Vec3fa & b,const Vec3fa & c)198 __forceinline Vec3fa msub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return a*b-c; } nmadd(const Vec3fa & a,const Vec3fa & b,const Vec3fa & c)199 __forceinline Vec3fa nmadd ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b+c;} nmsub(const Vec3fa & a,const Vec3fa & b,const Vec3fa & c)200 __forceinline Vec3fa nmsub ( const Vec3fa& a, const Vec3fa& b, const Vec3fa& c) { return -a*b-c; } 201 #endif 202 madd(const float a,const Vec3fa & b,const Vec3fa & c)203 __forceinline Vec3fa madd ( const float a, const Vec3fa& b, const Vec3fa& c) { return madd(Vec3fa(a),b,c); } msub(const float a,const Vec3fa & b,const Vec3fa & c)204 __forceinline Vec3fa msub ( const float a, const Vec3fa& b, const Vec3fa& c) { return msub(Vec3fa(a),b,c); } nmadd(const float a,const Vec3fa & b,const Vec3fa & c)205 __forceinline Vec3fa nmadd ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmadd(Vec3fa(a),b,c); } nmsub(const float a,const Vec3fa & b,const Vec3fa & c)206 __forceinline Vec3fa nmsub ( const float a, const Vec3fa& b, const Vec3fa& c) { return nmsub(Vec3fa(a),b,c); } 207 208 //////////////////////////////////////////////////////////////////////////////// 209 /// Assignment Operators 210 //////////////////////////////////////////////////////////////////////////////// 211 212 __forceinline Vec3fa& operator +=( Vec3fa& a, const Vec3fa& b ) { return a = a + b; } 213 __forceinline Vec3fa& operator -=( Vec3fa& a, const Vec3fa& b ) { return a = a - b; } 214 __forceinline Vec3fa& operator *=( Vec3fa& a, const Vec3fa& b ) { return a = a * b; } 215 __forceinline Vec3fa& operator *=( Vec3fa& a, const float b ) { return a = a * b; } 216 __forceinline Vec3fa& operator /=( Vec3fa& a, const Vec3fa& b ) { return a = a / b; } 217 __forceinline Vec3fa& operator /=( Vec3fa& a, const float b ) { return a = a / b; } 218 219 //////////////////////////////////////////////////////////////////////////////// 220 /// Reductions 221 //////////////////////////////////////////////////////////////////////////////// 222 reduce_add(const Vec3fa & v)223 __forceinline float reduce_add(const Vec3fa& v) { 224 const vfloat4 a(v.m128); 225 const vfloat4 b = shuffle<1>(a); 226 const vfloat4 c = shuffle<2>(a); 227 return _mm_cvtss_f32(a+b+c); 228 } 229 reduce_mul(const Vec3fa & v)230 __forceinline float reduce_mul(const Vec3fa& v) { return v.x*v.y*v.z; } reduce_min(const Vec3fa & v)231 __forceinline float reduce_min(const Vec3fa& v) { return min(v.x,v.y,v.z); } reduce_max(const Vec3fa & v)232 __forceinline float reduce_max(const Vec3fa& v) { return max(v.x,v.y,v.z); } 233 234 //////////////////////////////////////////////////////////////////////////////// 235 /// Comparison Operators 236 //////////////////////////////////////////////////////////////////////////////// 237 238 __forceinline bool operator ==( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } 239 __forceinline bool operator !=( const Vec3fa& a, const Vec3fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } 240 eq_mask(const Vec3fa & a,const Vec3fa & b)241 __forceinline Vec3ba eq_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } neq_mask(const Vec3fa & a,const Vec3fa & b)242 __forceinline Vec3ba neq_mask(const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } lt_mask(const Vec3fa & a,const Vec3fa & b)243 __forceinline Vec3ba lt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmplt_ps (a.m128, b.m128); } le_mask(const Vec3fa & a,const Vec3fa & b)244 __forceinline Vec3ba le_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmple_ps (a.m128, b.m128); } gt_mask(const Vec3fa & a,const Vec3fa & b)245 __forceinline Vec3ba gt_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } ge_mask(const Vec3fa & a,const Vec3fa & b)246 __forceinline Vec3ba ge_mask( const Vec3fa& a, const Vec3fa& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } 247 isvalid(const Vec3fa & v)248 __forceinline bool isvalid ( const Vec3fa& v ) { 249 return all(gt_mask(v,Vec3fa(-FLT_LARGE)) & lt_mask(v,Vec3fa(+FLT_LARGE))); 250 } 251 is_finite(const Vec3fa & a)252 __forceinline bool is_finite ( const Vec3fa& a ) { 253 return all(ge_mask(a,Vec3fa(-FLT_MAX)) & le_mask(a,Vec3fa(+FLT_MAX))); 254 } 255 isvalid4(const Vec3fa & v)256 __forceinline bool isvalid4 ( const Vec3fa& v ) { 257 return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); 258 } 259 is_finite4(const Vec3fa & a)260 __forceinline bool is_finite4 ( const Vec3fa& a ) { 261 return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); 262 } 263 264 //////////////////////////////////////////////////////////////////////////////// 265 /// Euclidian Space Operators 266 //////////////////////////////////////////////////////////////////////////////// 267 268 #if defined(__SSE4_1__) dot(const Vec3fa & a,const Vec3fa & b)269 __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { 270 return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); 271 } 272 #else dot(const Vec3fa & a,const Vec3fa & b)273 __forceinline float dot ( const Vec3fa& a, const Vec3fa& b ) { 274 return reduce_add(a*b); 275 } 276 #endif 277 cross(const Vec3fa & a,const Vec3fa & b)278 __forceinline Vec3fa cross ( const Vec3fa& a, const Vec3fa& b ) 279 { 280 vfloat4 a0 = vfloat4(a.m128); 281 vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); 282 vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); 283 vfloat4 b1 = vfloat4(b.m128); 284 return Vec3fa(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); 285 } 286 sqr_length(const Vec3fa & a)287 __forceinline float sqr_length ( const Vec3fa& a ) { return dot(a,a); } rcp_length(const Vec3fa & a)288 __forceinline float rcp_length ( const Vec3fa& a ) { return rsqrt(dot(a,a)); } rcp_length2(const Vec3fa & a)289 __forceinline float rcp_length2( const Vec3fa& a ) { return rcp(dot(a,a)); } length(const Vec3fa & a)290 __forceinline float length ( const Vec3fa& a ) { return sqrt(dot(a,a)); } normalize(const Vec3fa & a)291 __forceinline Vec3fa normalize( const Vec3fa& a ) { return a*rsqrt(dot(a,a)); } distance(const Vec3fa & a,const Vec3fa & b)292 __forceinline float distance ( const Vec3fa& a, const Vec3fa& b ) { return length(a-b); } halfArea(const Vec3fa & d)293 __forceinline float halfArea ( const Vec3fa& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } area(const Vec3fa & d)294 __forceinline float area ( const Vec3fa& d ) { return 2.0f*halfArea(d); } 295 normalize_safe(const Vec3fa & a)296 __forceinline Vec3fa normalize_safe( const Vec3fa& a ) { 297 const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); 298 } 299 300 /*! differentiated normalization */ dnormalize(const Vec3fa & p,const Vec3fa & dp)301 __forceinline Vec3fa dnormalize(const Vec3fa& p, const Vec3fa& dp) 302 { 303 const float pp = dot(p,p); 304 const float pdp = dot(p,dp); 305 return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); 306 } 307 308 //////////////////////////////////////////////////////////////////////////////// 309 /// Select 310 //////////////////////////////////////////////////////////////////////////////// 311 select(bool s,const Vec3fa & t,const Vec3fa & f)312 __forceinline Vec3fa select( bool s, const Vec3fa& t, const Vec3fa& f ) { 313 __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); 314 return blendv_ps(f.m128, t.m128, mask); 315 } 316 select(const Vec3ba & s,const Vec3fa & t,const Vec3fa & f)317 __forceinline Vec3fa select( const Vec3ba& s, const Vec3fa& t, const Vec3fa& f ) { 318 return blendv_ps(f.m128, t.m128, s); 319 } 320 lerp(const Vec3fa & v0,const Vec3fa & v1,const float t)321 __forceinline Vec3fa lerp(const Vec3fa& v0, const Vec3fa& v1, const float t) { 322 return madd(1.0f-t,v0,t*v1); 323 } 324 maxDim(const Vec3fa & a)325 __forceinline int maxDim ( const Vec3fa& a ) 326 { 327 const Vec3fa b = abs(a); 328 if (b.x > b.y) { 329 if (b.x > b.z) return 0; else return 2; 330 } else { 331 if (b.y > b.z) return 1; else return 2; 332 } 333 } 334 335 //////////////////////////////////////////////////////////////////////////////// 336 /// Rounding Functions 337 //////////////////////////////////////////////////////////////////////////////// 338 339 #if defined (__SSE4_1__) trunc(const Vec3fa & a)340 __forceinline Vec3fa trunc( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } floor(const Vec3fa & a)341 __forceinline Vec3fa floor( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } ceil(const Vec3fa & a)342 __forceinline Vec3fa ceil ( const Vec3fa& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } 343 #else trunc(const Vec3fa & a)344 __forceinline Vec3fa trunc( const Vec3fa& a ) { return Vec3fa(truncf(a.x),truncf(a.y),truncf(a.z)); } floor(const Vec3fa & a)345 __forceinline Vec3fa floor( const Vec3fa& a ) { return Vec3fa(floorf(a.x),floorf(a.y),floorf(a.z)); } ceil(const Vec3fa & a)346 __forceinline Vec3fa ceil ( const Vec3fa& a ) { return Vec3fa(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } 347 #endif 348 349 //////////////////////////////////////////////////////////////////////////////// 350 /// Output Operators 351 //////////////////////////////////////////////////////////////////////////////// 352 353 __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fa& a) { 354 return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; 355 } 356 357 typedef Vec3fa Vec3fa_t; 358 359 360 //////////////////////////////////////////////////////////////////////////////// 361 /// SSE Vec3fx Type 362 //////////////////////////////////////////////////////////////////////////////// 363 364 struct __aligned(16) Vec3fx 365 { 366 ALIGNED_STRUCT_(16); 367 368 typedef float Scalar; 369 enum { N = 3 }; 370 union { 371 __m128 m128; 372 struct { float x,y,z; union { int a; unsigned u; float w; }; }; 373 }; 374 375 //////////////////////////////////////////////////////////////////////////////// 376 /// Constructors, Assignment & Cast Operators 377 //////////////////////////////////////////////////////////////////////////////// 378 Vec3fxVec3fx379 __forceinline Vec3fx( ) {} Vec3fxVec3fx380 __forceinline Vec3fx( const __m128 a ) : m128(a) {} 381 Vec3fxVec3fx382 __forceinline explicit Vec3fx(const Vec3fa& v) : m128(v.m128) {} Vec3faVec3fx383 __forceinline operator Vec3fa () const { return Vec3fa(m128); } 384 Vec3fxVec3fx385 __forceinline explicit Vec3fx ( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); } 386 //__forceinline Vec3fx& operator =( const Vec3<float>& other ) { m128 = _mm_set_ps(0, other.z, other.y, other.x); return *this; } 387 Vec3fxVec3fx388 __forceinline Vec3fx ( const Vec3fx& other ) { m128 = other.m128; } 389 390 __forceinline Vec3fx& operator =( const Vec3fx& other ) { m128 = other.m128; return *this; } 391 Vec3fxVec3fx392 __forceinline explicit Vec3fx( const float a ) : m128(_mm_set1_ps(a)) {} Vec3fxVec3fx393 __forceinline Vec3fx( const float x, const float y, const float z) : m128(_mm_set_ps(0, z, y, x)) {} 394 Vec3fxVec3fx395 __forceinline Vec3fx( const Vec3fa& other, const int a1) { m128 = other.m128; a = a1; } Vec3fxVec3fx396 __forceinline Vec3fx( const Vec3fa& other, const unsigned a1) { m128 = other.m128; u = a1; } Vec3fxVec3fx397 __forceinline Vec3fx( const Vec3fa& other, const float w1) { 398 #if defined (__SSE4_1__) 399 m128 = _mm_insert_ps(other.m128, _mm_set_ss(w1),3 << 4); 400 #else 401 const vint4 mask(-1,-1,-1,0); 402 m128 = select(vboolf4(_mm_castsi128_ps(mask)),vfloat4(other.m128),vfloat4(w1)); 403 #endif 404 } 405 //__forceinline Vec3fx( const float x, const float y, const float z, const int a) : x(x), y(y), z(z), a(a) {} // not working properly! 406 //__forceinline Vec3fx( const float x, const float y, const float z, const unsigned a) : x(x), y(y), z(z), u(a) {} // not working properly! Vec3fxVec3fx407 __forceinline Vec3fx( const float x, const float y, const float z, const float w) : m128(_mm_set_ps(w, z, y, x)) {} 408 409 //__forceinline explicit Vec3fx( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} 410 vfloat4Vec3fx411 __forceinline explicit operator const vfloat4() const { return vfloat4(m128); } vint4Vec3fx412 __forceinline explicit operator const vint4() const { return vint4(_mm_cvtps_epi32(m128)); } Vec2faVec3fx413 __forceinline explicit operator const Vec2fa() const { return Vec2fa(m128); } Vec3iaVec3fx414 __forceinline explicit operator const Vec3ia() const { return Vec3ia(_mm_cvtps_epi32(m128)); } 415 416 //__forceinline operator const __m128&() const { return m128; } 417 //__forceinline operator __m128&() { return m128; } 418 419 //////////////////////////////////////////////////////////////////////////////// 420 /// Loads and Stores 421 //////////////////////////////////////////////////////////////////////////////// 422 loadVec3fx423 static __forceinline Vec3fx load( const void* const a ) { 424 return Vec3fx(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, -1, -1, -1)))); 425 } 426 loaduVec3fx427 static __forceinline Vec3fx loadu( const void* const a ) { 428 return Vec3fx(_mm_loadu_ps((float*)a)); 429 } 430 storeuVec3fx431 static __forceinline void storeu ( void* ptr, const Vec3fx& v ) { 432 _mm_storeu_ps((float*)ptr,v.m128); 433 } 434 435 //////////////////////////////////////////////////////////////////////////////// 436 /// Constants 437 //////////////////////////////////////////////////////////////////////////////// 438 Vec3fxVec3fx439 __forceinline Vec3fx( ZeroTy ) : m128(_mm_setzero_ps()) {} Vec3fxVec3fx440 __forceinline Vec3fx( OneTy ) : m128(_mm_set1_ps(1.0f)) {} Vec3fxVec3fx441 __forceinline Vec3fx( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} Vec3fxVec3fx442 __forceinline Vec3fx( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} 443 444 //////////////////////////////////////////////////////////////////////////////// 445 /// Array Access 446 //////////////////////////////////////////////////////////////////////////////// 447 448 __forceinline const float& operator []( const size_t index ) const { assert(index < 3); return (&x)[index]; } 449 __forceinline float& operator []( const size_t index ) { assert(index < 3); return (&x)[index]; } 450 }; 451 452 //////////////////////////////////////////////////////////////////////////////// 453 /// Unary Operators 454 //////////////////////////////////////////////////////////////////////////////// 455 456 __forceinline Vec3fx operator +( const Vec3fx& a ) { return a; } 457 __forceinline Vec3fx operator -( const Vec3fx& a ) { 458 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); 459 return _mm_xor_ps(a.m128, mask); 460 } abs(const Vec3fx & a)461 __forceinline Vec3fx abs ( const Vec3fx& a ) { 462 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); 463 return _mm_and_ps(a.m128, mask); 464 } sign(const Vec3fx & a)465 __forceinline Vec3fx sign ( const Vec3fx& a ) { 466 return blendv_ps(Vec3fx(one).m128, (-Vec3fx(one)).m128, _mm_cmplt_ps (a.m128,Vec3fx(zero).m128)); 467 } 468 rcp(const Vec3fx & a)469 __forceinline Vec3fx rcp ( const Vec3fx& a ) 470 { 471 #if defined(__AVX512VL__) 472 const Vec3fx r = _mm_rcp14_ps(a.m128); 473 #else 474 const Vec3fx r = _mm_rcp_ps(a.m128); 475 #endif 476 477 #if defined(__AVX2__) 478 const Vec3fx res = _mm_mul_ps(r.m128,_mm_fnmadd_ps(r.m128, a.m128, vfloat4(2.0f))); 479 #else 480 const Vec3fx res = _mm_mul_ps(r.m128,_mm_sub_ps(vfloat4(2.0f), _mm_mul_ps(r.m128, a.m128))); 481 //return _mm_sub_ps(_mm_add_ps(r, r), _mm_mul_ps(_mm_mul_ps(r, r), a)); 482 #endif 483 484 return res; 485 } 486 sqrt(const Vec3fx & a)487 __forceinline Vec3fx sqrt ( const Vec3fx& a ) { return _mm_sqrt_ps(a.m128); } sqr(const Vec3fx & a)488 __forceinline Vec3fx sqr ( const Vec3fx& a ) { return _mm_mul_ps(a.m128,a.m128); } 489 rsqrt(const Vec3fx & a)490 __forceinline Vec3fx rsqrt( const Vec3fx& a ) 491 { 492 #if defined(__AVX512VL__) 493 __m128 r = _mm_rsqrt14_ps(a.m128); 494 #else 495 __m128 r = _mm_rsqrt_ps(a.m128); 496 #endif 497 return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a.m128, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); 498 } 499 zero_fix(const Vec3fx & a)500 __forceinline Vec3fx zero_fix(const Vec3fx& a) { 501 return blendv_ps(a.m128, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); 502 } rcp_safe(const Vec3fx & a)503 __forceinline Vec3fx rcp_safe(const Vec3fx& a) { 504 return rcp(zero_fix(a)); 505 } log(const Vec3fx & a)506 __forceinline Vec3fx log ( const Vec3fx& a ) { 507 return Vec3fx(logf(a.x),logf(a.y),logf(a.z)); 508 } 509 exp(const Vec3fx & a)510 __forceinline Vec3fx exp ( const Vec3fx& a ) { 511 return Vec3fx(expf(a.x),expf(a.y),expf(a.z)); 512 } 513 514 //////////////////////////////////////////////////////////////////////////////// 515 /// Binary Operators 516 //////////////////////////////////////////////////////////////////////////////// 517 518 __forceinline Vec3fx operator +( const Vec3fx& a, const Vec3fx& b ) { return _mm_add_ps(a.m128, b.m128); } 519 __forceinline Vec3fx operator -( const Vec3fx& a, const Vec3fx& b ) { return _mm_sub_ps(a.m128, b.m128); } 520 __forceinline Vec3fx operator *( const Vec3fx& a, const Vec3fx& b ) { return _mm_mul_ps(a.m128, b.m128); } 521 __forceinline Vec3fx operator *( const Vec3fx& a, const float b ) { return a * Vec3fx(b); } 522 __forceinline Vec3fx operator *( const float a, const Vec3fx& b ) { return Vec3fx(a) * b; } 523 __forceinline Vec3fx operator /( const Vec3fx& a, const Vec3fx& b ) { return _mm_div_ps(a.m128,b.m128); } 524 __forceinline Vec3fx operator /( const Vec3fx& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } 525 __forceinline Vec3fx operator /( const float a, const Vec3fx& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } 526 min(const Vec3fx & a,const Vec3fx & b)527 __forceinline Vec3fx min( const Vec3fx& a, const Vec3fx& b ) { return _mm_min_ps(a.m128,b.m128); } max(const Vec3fx & a,const Vec3fx & b)528 __forceinline Vec3fx max( const Vec3fx& a, const Vec3fx& b ) { return _mm_max_ps(a.m128,b.m128); } 529 530 #if defined(__SSE4_1__) mini(const Vec3fx & a,const Vec3fx & b)531 __forceinline Vec3fx mini(const Vec3fx& a, const Vec3fx& b) { 532 const vint4 ai = _mm_castps_si128(a.m128); 533 const vint4 bi = _mm_castps_si128(b.m128); 534 const vint4 ci = _mm_min_epi32(ai,bi); 535 return _mm_castsi128_ps(ci); 536 } 537 #endif 538 539 #if defined(__SSE4_1__) maxi(const Vec3fx & a,const Vec3fx & b)540 __forceinline Vec3fx maxi(const Vec3fx& a, const Vec3fx& b) { 541 const vint4 ai = _mm_castps_si128(a.m128); 542 const vint4 bi = _mm_castps_si128(b.m128); 543 const vint4 ci = _mm_max_epi32(ai,bi); 544 return _mm_castsi128_ps(ci); 545 } 546 #endif 547 pow(const Vec3fx & a,const float & b)548 __forceinline Vec3fx pow ( const Vec3fx& a, const float& b ) { 549 return Vec3fx(powf(a.x,b),powf(a.y,b),powf(a.z,b)); 550 } 551 552 //////////////////////////////////////////////////////////////////////////////// 553 /// Ternary Operators 554 //////////////////////////////////////////////////////////////////////////////// 555 556 #if defined(__AVX2__) madd(const Vec3fx & a,const Vec3fx & b,const Vec3fx & c)557 __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmadd_ps(a.m128,b.m128,c.m128); } msub(const Vec3fx & a,const Vec3fx & b,const Vec3fx & c)558 __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fmsub_ps(a.m128,b.m128,c.m128); } nmadd(const Vec3fx & a,const Vec3fx & b,const Vec3fx & c)559 __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmadd_ps(a.m128,b.m128,c.m128); } nmsub(const Vec3fx & a,const Vec3fx & b,const Vec3fx & c)560 __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return _mm_fnmsub_ps(a.m128,b.m128,c.m128); } 561 #else madd(const Vec3fx & a,const Vec3fx & b,const Vec3fx & c)562 __forceinline Vec3fx madd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b+c; } msub(const Vec3fx & a,const Vec3fx & b,const Vec3fx & c)563 __forceinline Vec3fx msub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return a*b-c; } nmadd(const Vec3fx & a,const Vec3fx & b,const Vec3fx & c)564 __forceinline Vec3fx nmadd ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b+c;} nmsub(const Vec3fx & a,const Vec3fx & b,const Vec3fx & c)565 __forceinline Vec3fx nmsub ( const Vec3fx& a, const Vec3fx& b, const Vec3fx& c) { return -a*b-c; } 566 #endif 567 madd(const float a,const Vec3fx & b,const Vec3fx & c)568 __forceinline Vec3fx madd ( const float a, const Vec3fx& b, const Vec3fx& c) { return madd(Vec3fx(a),b,c); } msub(const float a,const Vec3fx & b,const Vec3fx & c)569 __forceinline Vec3fx msub ( const float a, const Vec3fx& b, const Vec3fx& c) { return msub(Vec3fx(a),b,c); } nmadd(const float a,const Vec3fx & b,const Vec3fx & c)570 __forceinline Vec3fx nmadd ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmadd(Vec3fx(a),b,c); } nmsub(const float a,const Vec3fx & b,const Vec3fx & c)571 __forceinline Vec3fx nmsub ( const float a, const Vec3fx& b, const Vec3fx& c) { return nmsub(Vec3fx(a),b,c); } 572 573 //////////////////////////////////////////////////////////////////////////////// 574 /// Assignment Operators 575 //////////////////////////////////////////////////////////////////////////////// 576 577 __forceinline Vec3fx& operator +=( Vec3fx& a, const Vec3fx& b ) { return a = a + b; } 578 __forceinline Vec3fx& operator -=( Vec3fx& a, const Vec3fx& b ) { return a = a - b; } 579 __forceinline Vec3fx& operator *=( Vec3fx& a, const Vec3fx& b ) { return a = a * b; } 580 __forceinline Vec3fx& operator *=( Vec3fx& a, const float b ) { return a = a * b; } 581 __forceinline Vec3fx& operator /=( Vec3fx& a, const Vec3fx& b ) { return a = a / b; } 582 __forceinline Vec3fx& operator /=( Vec3fx& a, const float b ) { return a = a / b; } 583 584 //////////////////////////////////////////////////////////////////////////////// 585 /// Reductions 586 //////////////////////////////////////////////////////////////////////////////// 587 reduce_add(const Vec3fx & v)588 __forceinline float reduce_add(const Vec3fx& v) { 589 const vfloat4 a(v.m128); 590 const vfloat4 b = shuffle<1>(a); 591 const vfloat4 c = shuffle<2>(a); 592 return _mm_cvtss_f32(a+b+c); 593 } 594 reduce_mul(const Vec3fx & v)595 __forceinline float reduce_mul(const Vec3fx& v) { return v.x*v.y*v.z; } reduce_min(const Vec3fx & v)596 __forceinline float reduce_min(const Vec3fx& v) { return min(v.x,v.y,v.z); } reduce_max(const Vec3fx & v)597 __forceinline float reduce_max(const Vec3fx& v) { return max(v.x,v.y,v.z); } 598 599 //////////////////////////////////////////////////////////////////////////////// 600 /// Comparison Operators 601 //////////////////////////////////////////////////////////////////////////////// 602 603 __forceinline bool operator ==( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 7) == 7; } 604 __forceinline bool operator !=( const Vec3fx& a, const Vec3fx& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 7) != 0; } 605 eq_mask(const Vec3fx & a,const Vec3fx & b)606 __forceinline Vec3ba eq_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpeq_ps (a.m128, b.m128); } neq_mask(const Vec3fx & a,const Vec3fx & b)607 __forceinline Vec3ba neq_mask(const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpneq_ps(a.m128, b.m128); } lt_mask(const Vec3fx & a,const Vec3fx & b)608 __forceinline Vec3ba lt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmplt_ps (a.m128, b.m128); } le_mask(const Vec3fx & a,const Vec3fx & b)609 __forceinline Vec3ba le_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmple_ps (a.m128, b.m128); } gt_mask(const Vec3fx & a,const Vec3fx & b)610 __forceinline Vec3ba gt_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnle_ps(a.m128, b.m128); } ge_mask(const Vec3fx & a,const Vec3fx & b)611 __forceinline Vec3ba ge_mask( const Vec3fx& a, const Vec3fx& b ) { return _mm_cmpnlt_ps(a.m128, b.m128); } 612 isvalid(const Vec3fx & v)613 __forceinline bool isvalid ( const Vec3fx& v ) { 614 return all(gt_mask(v,Vec3fx(-FLT_LARGE)) & lt_mask(v,Vec3fx(+FLT_LARGE))); 615 } 616 is_finite(const Vec3fx & a)617 __forceinline bool is_finite ( const Vec3fx& a ) { 618 return all(ge_mask(a,Vec3fx(-FLT_MAX)) & le_mask(a,Vec3fx(+FLT_MAX))); 619 } 620 isvalid4(const Vec3fx & v)621 __forceinline bool isvalid4 ( const Vec3fx& v ) { 622 return all((vfloat4(v.m128) > vfloat4(-FLT_LARGE)) & (vfloat4(v.m128) < vfloat4(+FLT_LARGE))); 623 } 624 is_finite4(const Vec3fx & a)625 __forceinline bool is_finite4 ( const Vec3fx& a ) { 626 return all((vfloat4(a.m128) >= vfloat4(-FLT_MAX)) & (vfloat4(a.m128) <= vfloat4(+FLT_MAX))); 627 } 628 629 //////////////////////////////////////////////////////////////////////////////// 630 /// Euclidian Space Operators 631 //////////////////////////////////////////////////////////////////////////////// 632 633 #if defined(__SSE4_1__) dot(const Vec3fx & a,const Vec3fx & b)634 __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { 635 return _mm_cvtss_f32(_mm_dp_ps(a.m128,b.m128,0x7F)); 636 } 637 #else dot(const Vec3fx & a,const Vec3fx & b)638 __forceinline float dot ( const Vec3fx& a, const Vec3fx& b ) { 639 return reduce_add(a*b); 640 } 641 #endif 642 cross(const Vec3fx & a,const Vec3fx & b)643 __forceinline Vec3fx cross ( const Vec3fx& a, const Vec3fx& b ) 644 { 645 vfloat4 a0 = vfloat4(a.m128); 646 vfloat4 b0 = shuffle<1,2,0,3>(vfloat4(b.m128)); 647 vfloat4 a1 = shuffle<1,2,0,3>(vfloat4(a.m128)); 648 vfloat4 b1 = vfloat4(b.m128); 649 return Vec3fx(shuffle<1,2,0,3>(msub(a0,b0,a1*b1))); 650 } 651 sqr_length(const Vec3fx & a)652 __forceinline float sqr_length ( const Vec3fx& a ) { return dot(a,a); } rcp_length(const Vec3fx & a)653 __forceinline float rcp_length ( const Vec3fx& a ) { return rsqrt(dot(a,a)); } rcp_length2(const Vec3fx & a)654 __forceinline float rcp_length2( const Vec3fx& a ) { return rcp(dot(a,a)); } length(const Vec3fx & a)655 __forceinline float length ( const Vec3fx& a ) { return sqrt(dot(a,a)); } normalize(const Vec3fx & a)656 __forceinline Vec3fx normalize( const Vec3fx& a ) { return a*rsqrt(dot(a,a)); } distance(const Vec3fx & a,const Vec3fx & b)657 __forceinline float distance ( const Vec3fx& a, const Vec3fx& b ) { return length(a-b); } halfArea(const Vec3fx & d)658 __forceinline float halfArea ( const Vec3fx& d ) { return madd(d.x,(d.y+d.z),d.y*d.z); } area(const Vec3fx & d)659 __forceinline float area ( const Vec3fx& d ) { return 2.0f*halfArea(d); } 660 normalize_safe(const Vec3fx & a)661 __forceinline Vec3fx normalize_safe( const Vec3fx& a ) { 662 const float d = dot(a,a); if (unlikely(d == 0.0f)) return a; else return a*rsqrt(d); 663 } 664 665 /*! differentiated normalization */ dnormalize(const Vec3fx & p,const Vec3fx & dp)666 __forceinline Vec3fx dnormalize(const Vec3fx& p, const Vec3fx& dp) 667 { 668 const float pp = dot(p,p); 669 const float pdp = dot(p,dp); 670 return (pp*dp-pdp*p)*rcp(pp)*rsqrt(pp); 671 } 672 673 //////////////////////////////////////////////////////////////////////////////// 674 /// Select 675 //////////////////////////////////////////////////////////////////////////////// 676 select(bool s,const Vec3fx & t,const Vec3fx & f)677 __forceinline Vec3fx select( bool s, const Vec3fx& t, const Vec3fx& f ) { 678 __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); 679 return blendv_ps(f.m128, t.m128, mask); 680 } 681 select(const Vec3ba & s,const Vec3fx & t,const Vec3fx & f)682 __forceinline Vec3fx select( const Vec3ba& s, const Vec3fx& t, const Vec3fx& f ) { 683 return blendv_ps(f.m128, t.m128, s); 684 } 685 lerp(const Vec3fx & v0,const Vec3fx & v1,const float t)686 __forceinline Vec3fx lerp(const Vec3fx& v0, const Vec3fx& v1, const float t) { 687 return madd(1.0f-t,v0,t*v1); 688 } 689 maxDim(const Vec3fx & a)690 __forceinline int maxDim ( const Vec3fx& a ) 691 { 692 const Vec3fx b = abs(a); 693 if (b.x > b.y) { 694 if (b.x > b.z) return 0; else return 2; 695 } else { 696 if (b.y > b.z) return 1; else return 2; 697 } 698 } 699 700 //////////////////////////////////////////////////////////////////////////////// 701 /// Rounding Functions 702 //////////////////////////////////////////////////////////////////////////////// 703 704 #if defined(__aarch64__) trunc(const Vec3fx & a)705 __forceinline Vec3fx trunc(const Vec3fx& a) { return vrndq_f32(a.m128); } floor(const Vec3fx & a)706 __forceinline Vec3fx floor(const Vec3fx& a) { return vrndmq_f32(a.m128); } ceil(const Vec3fx & a)707 __forceinline Vec3fx ceil (const Vec3fx& a) { return vrndpq_f32(a.m128); } 708 #elif defined (__SSE4_1__) trunc(const Vec3fx & a)709 __forceinline Vec3fx trunc( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEAREST_INT); } floor(const Vec3fx & a)710 __forceinline Vec3fx floor( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_NEG_INF ); } ceil(const Vec3fx & a)711 __forceinline Vec3fx ceil ( const Vec3fx& a ) { return _mm_round_ps(a.m128, _MM_FROUND_TO_POS_INF ); } 712 #else trunc(const Vec3fx & a)713 __forceinline Vec3fx trunc( const Vec3fx& a ) { return Vec3fx(truncf(a.x),truncf(a.y),truncf(a.z)); } floor(const Vec3fx & a)714 __forceinline Vec3fx floor( const Vec3fx& a ) { return Vec3fx(floorf(a.x),floorf(a.y),floorf(a.z)); } ceil(const Vec3fx & a)715 __forceinline Vec3fx ceil ( const Vec3fx& a ) { return Vec3fx(ceilf (a.x),ceilf (a.y),ceilf (a.z)); } 716 #endif 717 718 //////////////////////////////////////////////////////////////////////////////// 719 /// Output Operators 720 //////////////////////////////////////////////////////////////////////////////// 721 722 __forceinline embree_ostream operator<<(embree_ostream cout, const Vec3fx& a) { 723 return cout << "(" << a.x << ", " << a.y << ", " << a.z << ")"; 724 } 725 726 727 typedef Vec3fx Vec3ff; 728 } 729