1 // Copyright 2009-2021 Intel Corporation 2 // SPDX-License-Identifier: Apache-2.0 3 4 #pragma once 5 6 #include "../sys/alloc.h" 7 #include "math.h" 8 #include "../simd/sse.h" 9 10 namespace embree 11 { 12 //////////////////////////////////////////////////////////////////////////////// 13 /// SSE Vec2fa Type 14 //////////////////////////////////////////////////////////////////////////////// 15 16 struct __aligned(16) Vec2fa 17 { 18 ALIGNED_STRUCT_(16); 19 20 typedef float Scalar; 21 enum { N = 2 }; 22 union { 23 __m128 m128; 24 struct { float x,y,az,aw; }; 25 }; 26 27 //////////////////////////////////////////////////////////////////////////////// 28 /// Constructors, Assignment & Cast Operators 29 //////////////////////////////////////////////////////////////////////////////// 30 Vec2faVec2fa31 __forceinline Vec2fa( ) {} Vec2faVec2fa32 __forceinline Vec2fa( const __m128 a ) : m128(a) {} 33 Vec2faVec2fa34 __forceinline Vec2fa ( const Vec2<float>& other ) { x = other.x; y = other.y; } 35 __forceinline Vec2fa& operator =( const Vec2<float>& other ) { x = other.x; y = other.y; return *this; } 36 Vec2faVec2fa37 __forceinline Vec2fa ( const Vec2fa& other ) { m128 = other.m128; } 38 __forceinline Vec2fa& operator =( const Vec2fa& other ) { m128 = other.m128; return *this; } 39 Vec2faVec2fa40 __forceinline explicit Vec2fa( const float a ) : m128(_mm_set1_ps(a)) {} Vec2faVec2fa41 __forceinline Vec2fa( const float x, const float y) : m128(_mm_set_ps(y, y, y, x)) {} 42 Vec2faVec2fa43 __forceinline explicit Vec2fa( const __m128i a ) : m128(_mm_cvtepi32_ps(a)) {} 44 45 __forceinline operator const __m128&() const { return m128; } 46 __forceinline operator __m128&() { return m128; } 47 48 //////////////////////////////////////////////////////////////////////////////// 49 /// Loads and Stores 50 //////////////////////////////////////////////////////////////////////////////// 51 loadVec2fa52 static __forceinline Vec2fa load( const void* const a ) { 53 return Vec2fa(_mm_and_ps(_mm_load_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); 54 } 55 loaduVec2fa56 static __forceinline Vec2fa loadu( const void* const a ) { 57 return Vec2fa(_mm_and_ps(_mm_loadu_ps((float*)a),_mm_castsi128_ps(_mm_set_epi32(0, 0, -1, -1)))); 58 } 59 storeuVec2fa60 static __forceinline void storeu ( void* ptr, const Vec2fa& v ) { 61 _mm_storeu_ps((float*)ptr,v); 62 } 63 64 //////////////////////////////////////////////////////////////////////////////// 65 /// Constants 66 //////////////////////////////////////////////////////////////////////////////// 67 Vec2faVec2fa68 __forceinline Vec2fa( ZeroTy ) : m128(_mm_setzero_ps()) {} Vec2faVec2fa69 __forceinline Vec2fa( OneTy ) : m128(_mm_set1_ps(1.0f)) {} Vec2faVec2fa70 __forceinline Vec2fa( PosInfTy ) : m128(_mm_set1_ps(pos_inf)) {} Vec2faVec2fa71 __forceinline Vec2fa( NegInfTy ) : m128(_mm_set1_ps(neg_inf)) {} 72 73 //////////////////////////////////////////////////////////////////////////////// 74 /// Array Access 75 //////////////////////////////////////////////////////////////////////////////// 76 77 __forceinline const float& operator []( const size_t index ) const { assert(index < 2); return (&x)[index]; } 78 __forceinline float& operator []( const size_t index ) { assert(index < 2); return (&x)[index]; } 79 }; 80 81 //////////////////////////////////////////////////////////////////////////////// 82 /// Unary Operators 83 //////////////////////////////////////////////////////////////////////////////// 84 85 __forceinline Vec2fa operator +( const Vec2fa& a ) { return a; } 86 __forceinline Vec2fa operator -( const Vec2fa& a ) { 87 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000)); 88 return _mm_xor_ps(a.m128, mask); 89 } abs(const Vec2fa & a)90 __forceinline Vec2fa abs ( const Vec2fa& a ) { 91 const __m128 mask = _mm_castsi128_ps(_mm_set1_epi32(0x7fffffff)); 92 return _mm_and_ps(a.m128, mask); 93 } sign(const Vec2fa & a)94 __forceinline Vec2fa sign ( const Vec2fa& a ) { 95 return blendv_ps(Vec2fa(one), -Vec2fa(one), _mm_cmplt_ps (a,Vec2fa(zero))); 96 } 97 rcp(const Vec2fa & a)98 __forceinline Vec2fa rcp ( const Vec2fa& a ) 99 { 100 #if defined(__AVX512VL__) 101 const Vec2fa r = _mm_rcp14_ps(a.m128); 102 #else 103 const Vec2fa r = _mm_rcp_ps(a.m128); 104 #endif 105 106 #if defined(__AVX2__) 107 const Vec2fa h_n = _mm_fnmadd_ps(a, r, vfloat4(1.0)); // First, compute 1 - a * r (which will be very close to 0) 108 const Vec2fa res = _mm_fmadd_ps(r, h_n, r); // Then compute r + r * h_n 109 #else 110 const Vec2fa h_n = _mm_sub_ps(vfloat4(1.0f), _mm_mul_ps(a, r)); // First, compute 1 - a * r (which will be very close to 0) 111 const Vec2fa res = _mm_add_ps(r,_mm_mul_ps(r, h_n)); // Then compute r + r * h_n 112 #endif 113 114 return res; 115 } 116 sqrt(const Vec2fa & a)117 __forceinline Vec2fa sqrt ( const Vec2fa& a ) { return _mm_sqrt_ps(a.m128); } sqr(const Vec2fa & a)118 __forceinline Vec2fa sqr ( const Vec2fa& a ) { return _mm_mul_ps(a,a); } 119 rsqrt(const Vec2fa & a)120 __forceinline Vec2fa rsqrt( const Vec2fa& a ) 121 { 122 #if defined(__AVX512VL__) 123 __m128 r = _mm_rsqrt14_ps(a.m128); 124 #else 125 __m128 r = _mm_rsqrt_ps(a.m128); 126 #endif 127 return _mm_add_ps(_mm_mul_ps(_mm_set1_ps(1.5f),r), _mm_mul_ps(_mm_mul_ps(_mm_mul_ps(a, _mm_set1_ps(-0.5f)), r), _mm_mul_ps(r, r))); 128 } 129 zero_fix(const Vec2fa & a)130 __forceinline Vec2fa zero_fix(const Vec2fa& a) { 131 return blendv_ps(a, _mm_set1_ps(min_rcp_input), _mm_cmplt_ps (abs(a).m128, _mm_set1_ps(min_rcp_input))); 132 } rcp_safe(const Vec2fa & a)133 __forceinline Vec2fa rcp_safe(const Vec2fa& a) { 134 return rcp(zero_fix(a)); 135 } log(const Vec2fa & a)136 __forceinline Vec2fa log ( const Vec2fa& a ) { 137 return Vec2fa(logf(a.x),logf(a.y)); 138 } 139 exp(const Vec2fa & a)140 __forceinline Vec2fa exp ( const Vec2fa& a ) { 141 return Vec2fa(expf(a.x),expf(a.y)); 142 } 143 144 //////////////////////////////////////////////////////////////////////////////// 145 /// Binary Operators 146 //////////////////////////////////////////////////////////////////////////////// 147 148 __forceinline Vec2fa operator +( const Vec2fa& a, const Vec2fa& b ) { return _mm_add_ps(a.m128, b.m128); } 149 __forceinline Vec2fa operator -( const Vec2fa& a, const Vec2fa& b ) { return _mm_sub_ps(a.m128, b.m128); } 150 __forceinline Vec2fa operator *( const Vec2fa& a, const Vec2fa& b ) { return _mm_mul_ps(a.m128, b.m128); } 151 __forceinline Vec2fa operator *( const Vec2fa& a, const float b ) { return a * Vec2fa(b); } 152 __forceinline Vec2fa operator *( const float a, const Vec2fa& b ) { return Vec2fa(a) * b; } 153 __forceinline Vec2fa operator /( const Vec2fa& a, const Vec2fa& b ) { return _mm_div_ps(a.m128,b.m128); } 154 __forceinline Vec2fa operator /( const Vec2fa& a, const float b ) { return _mm_div_ps(a.m128,_mm_set1_ps(b)); } 155 __forceinline Vec2fa operator /( const float a, const Vec2fa& b ) { return _mm_div_ps(_mm_set1_ps(a),b.m128); } 156 min(const Vec2fa & a,const Vec2fa & b)157 __forceinline Vec2fa min( const Vec2fa& a, const Vec2fa& b ) { return _mm_min_ps(a.m128,b.m128); } max(const Vec2fa & a,const Vec2fa & b)158 __forceinline Vec2fa max( const Vec2fa& a, const Vec2fa& b ) { return _mm_max_ps(a.m128,b.m128); } 159 160 #if defined(__SSE4_1__) mini(const Vec2fa & a,const Vec2fa & b)161 __forceinline Vec2fa mini(const Vec2fa& a, const Vec2fa& b) { 162 const vint4 ai = _mm_castps_si128(a); 163 const vint4 bi = _mm_castps_si128(b); 164 const vint4 ci = _mm_min_epi32(ai,bi); 165 return _mm_castsi128_ps(ci); 166 } 167 #endif 168 169 #if defined(__SSE4_1__) maxi(const Vec2fa & a,const Vec2fa & b)170 __forceinline Vec2fa maxi(const Vec2fa& a, const Vec2fa& b) { 171 const vint4 ai = _mm_castps_si128(a); 172 const vint4 bi = _mm_castps_si128(b); 173 const vint4 ci = _mm_max_epi32(ai,bi); 174 return _mm_castsi128_ps(ci); 175 } 176 #endif 177 pow(const Vec2fa & a,const float & b)178 __forceinline Vec2fa pow ( const Vec2fa& a, const float& b ) { 179 return Vec2fa(powf(a.x,b),powf(a.y,b)); 180 } 181 182 //////////////////////////////////////////////////////////////////////////////// 183 /// Ternary Operators 184 //////////////////////////////////////////////////////////////////////////////// 185 186 #if defined(__AVX2__) madd(const Vec2fa & a,const Vec2fa & b,const Vec2fa & c)187 __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmadd_ps(a,b,c); } msub(const Vec2fa & a,const Vec2fa & b,const Vec2fa & c)188 __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fmsub_ps(a,b,c); } nmadd(const Vec2fa & a,const Vec2fa & b,const Vec2fa & c)189 __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmadd_ps(a,b,c); } nmsub(const Vec2fa & a,const Vec2fa & b,const Vec2fa & c)190 __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return _mm_fnmsub_ps(a,b,c); } 191 #else madd(const Vec2fa & a,const Vec2fa & b,const Vec2fa & c)192 __forceinline Vec2fa madd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b+c; } msub(const Vec2fa & a,const Vec2fa & b,const Vec2fa & c)193 __forceinline Vec2fa msub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return a*b-c; } nmadd(const Vec2fa & a,const Vec2fa & b,const Vec2fa & c)194 __forceinline Vec2fa nmadd ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b+c;} nmsub(const Vec2fa & a,const Vec2fa & b,const Vec2fa & c)195 __forceinline Vec2fa nmsub ( const Vec2fa& a, const Vec2fa& b, const Vec2fa& c) { return -a*b-c; } 196 #endif 197 madd(const float a,const Vec2fa & b,const Vec2fa & c)198 __forceinline Vec2fa madd ( const float a, const Vec2fa& b, const Vec2fa& c) { return madd(Vec2fa(a),b,c); } msub(const float a,const Vec2fa & b,const Vec2fa & c)199 __forceinline Vec2fa msub ( const float a, const Vec2fa& b, const Vec2fa& c) { return msub(Vec2fa(a),b,c); } nmadd(const float a,const Vec2fa & b,const Vec2fa & c)200 __forceinline Vec2fa nmadd ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmadd(Vec2fa(a),b,c); } nmsub(const float a,const Vec2fa & b,const Vec2fa & c)201 __forceinline Vec2fa nmsub ( const float a, const Vec2fa& b, const Vec2fa& c) { return nmsub(Vec2fa(a),b,c); } 202 203 //////////////////////////////////////////////////////////////////////////////// 204 /// Assignment Operators 205 //////////////////////////////////////////////////////////////////////////////// 206 207 __forceinline Vec2fa& operator +=( Vec2fa& a, const Vec2fa& b ) { return a = a + b; } 208 __forceinline Vec2fa& operator -=( Vec2fa& a, const Vec2fa& b ) { return a = a - b; } 209 __forceinline Vec2fa& operator *=( Vec2fa& a, const Vec2fa& b ) { return a = a * b; } 210 __forceinline Vec2fa& operator *=( Vec2fa& a, const float b ) { return a = a * b; } 211 __forceinline Vec2fa& operator /=( Vec2fa& a, const Vec2fa& b ) { return a = a / b; } 212 __forceinline Vec2fa& operator /=( Vec2fa& a, const float b ) { return a = a / b; } 213 214 //////////////////////////////////////////////////////////////////////////////// 215 /// Reductions 216 //////////////////////////////////////////////////////////////////////////////// 217 reduce_add(const Vec2fa & v)218 __forceinline float reduce_add(const Vec2fa& v) { return v.x+v.y; } reduce_mul(const Vec2fa & v)219 __forceinline float reduce_mul(const Vec2fa& v) { return v.x*v.y; } reduce_min(const Vec2fa & v)220 __forceinline float reduce_min(const Vec2fa& v) { return min(v.x,v.y); } reduce_max(const Vec2fa & v)221 __forceinline float reduce_max(const Vec2fa& v) { return max(v.x,v.y); } 222 223 //////////////////////////////////////////////////////////////////////////////// 224 /// Comparison Operators 225 //////////////////////////////////////////////////////////////////////////////// 226 227 __forceinline bool operator ==( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpeq_ps (a.m128, b.m128)) & 3) == 3; } 228 __forceinline bool operator !=( const Vec2fa& a, const Vec2fa& b ) { return (_mm_movemask_ps(_mm_cmpneq_ps(a.m128, b.m128)) & 3) != 0; } 229 230 //////////////////////////////////////////////////////////////////////////////// 231 /// Euclidian Space Operators 232 //////////////////////////////////////////////////////////////////////////////// 233 234 #if defined(__SSE4_1__) dot(const Vec2fa & a,const Vec2fa & b)235 __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { 236 return _mm_cvtss_f32(_mm_dp_ps(a,b,0x3F)); 237 } 238 #else dot(const Vec2fa & a,const Vec2fa & b)239 __forceinline float dot ( const Vec2fa& a, const Vec2fa& b ) { 240 return reduce_add(a*b); 241 } 242 #endif 243 cross(const Vec2fa & a)244 __forceinline Vec2fa cross ( const Vec2fa& a ) { 245 return Vec2fa(-a.y,a.x); 246 } 247 sqr_length(const Vec2fa & a)248 __forceinline float sqr_length ( const Vec2fa& a ) { return dot(a,a); } rcp_length(const Vec2fa & a)249 __forceinline float rcp_length ( const Vec2fa& a ) { return rsqrt(dot(a,a)); } rcp_length2(const Vec2fa & a)250 __forceinline float rcp_length2( const Vec2fa& a ) { return rcp(dot(a,a)); } length(const Vec2fa & a)251 __forceinline float length ( const Vec2fa& a ) { return sqrt(dot(a,a)); } normalize(const Vec2fa & a)252 __forceinline Vec2fa normalize( const Vec2fa& a ) { return a*rsqrt(dot(a,a)); } distance(const Vec2fa & a,const Vec2fa & b)253 __forceinline float distance ( const Vec2fa& a, const Vec2fa& b ) { return length(a-b); } 254 255 //////////////////////////////////////////////////////////////////////////////// 256 /// Select 257 //////////////////////////////////////////////////////////////////////////////// 258 select(bool s,const Vec2fa & t,const Vec2fa & f)259 __forceinline Vec2fa select( bool s, const Vec2fa& t, const Vec2fa& f ) { 260 __m128 mask = s ? _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_setzero_si128(), _mm_setzero_si128())) : _mm_setzero_ps(); 261 return blendv_ps(f, t, mask); 262 } 263 lerp(const Vec2fa & v0,const Vec2fa & v1,const float t)264 __forceinline Vec2fa lerp(const Vec2fa& v0, const Vec2fa& v1, const float t) { 265 return madd(1.0f-t,v0,t*v1); 266 } 267 maxDim(const Vec2fa & a)268 __forceinline int maxDim ( const Vec2fa& a ) 269 { 270 const Vec2fa b = abs(a); 271 if (b.x > b.y) return 0; 272 else return 1; 273 } 274 275 //////////////////////////////////////////////////////////////////////////////// 276 /// Rounding Functions 277 //////////////////////////////////////////////////////////////////////////////// 278 279 #if defined(__aarch64__) 280 //__forceinline Vec2fa trunc(const Vec2fa& a) { return vrndq_f32(a); } floor(const Vec2fa & a)281 __forceinline Vec2fa floor(const Vec2fa& a) { return vrndmq_f32(a); } ceil(const Vec2fa & a)282 __forceinline Vec2fa ceil (const Vec2fa& a) { return vrndpq_f32(a); } 283 #elif defined (__SSE4_1__) 284 //__forceinline Vec2fa trunc( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT); } floor(const Vec2fa & a)285 __forceinline Vec2fa floor( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF ); } ceil(const Vec2fa & a)286 __forceinline Vec2fa ceil ( const Vec2fa& a ) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF ); } 287 #else 288 //__forceinline Vec2fa trunc( const Vec2fa& a ) { return Vec2fa(truncf(a.x),truncf(a.y),truncf(a.z)); } floor(const Vec2fa & a)289 __forceinline Vec2fa floor( const Vec2fa& a ) { return Vec2fa(floorf(a.x),floorf(a.y)); } ceil(const Vec2fa & a)290 __forceinline Vec2fa ceil ( const Vec2fa& a ) { return Vec2fa(ceilf (a.x),ceilf (a.y)); } 291 #endif 292 293 //////////////////////////////////////////////////////////////////////////////// 294 /// Output Operators 295 //////////////////////////////////////////////////////////////////////////////// 296 297 __forceinline embree_ostream operator<<(embree_ostream cout, const Vec2fa& a) { 298 return cout << "(" << a.x << ", " << a.y << ")"; 299 } 300 301 typedef Vec2fa Vec2fa_t; 302 } 303