#ifndef NETGEN_CORE_SIMD_SSE_HPP
#define NETGEN_CORE_SIMD_SSE_HPP

/**************************************************************************/
/* File:   simd_sse.hpp                                                   */
/* Author: Joachim Schoeberl, Matthias Hochsteger                         */
/* Date:   25. Mar. 16                                                    */
/**************************************************************************/

#include <immintrin.h>

#include <array>
#include <cstddef>
#include <cstdint>
#include <functional>
#include <tuple>
#include <type_traits>

namespace ngcore
{

  // SIMD mask holding two 64-bit lanes, stored in an SSE integer register.
  template <>
  class SIMD<mask64,2>
  {
    __m128i mask;
  public:
    // set the first i lanes (i = 0, 1 or 2) to all-ones
    SIMD (int i)
      : mask(_mm_cmpgt_epi32(_mm_set1_epi32(i),
                             _mm_set_epi32(1, 1, 0, 0)))
    { ; }
    SIMD (__m128i _mask) : mask(_mask) { ; }
    __m128i Data() const { return mask; }
    static constexpr int Size() { return 2; }
    static NETGEN_INLINE SIMD<mask64, 2> GetMaskFromBits (unsigned int i);
  };

  // lookup table: bit k of the argument selects lane k
  static SIMD<mask64, 2> masks_from_2bits[4] = {
    _mm_set_epi32 (0,0,0,0), _mm_set_epi32 (0,0,-1,0),
    _mm_set_epi32 (-1,0,0,0), _mm_set_epi32 (-1,0,-1,0),
  };

  NETGEN_INLINE SIMD<mask64, 2> SIMD<mask64, 2> :: GetMaskFromBits (unsigned int i)
  {
    return masks_from_2bits[i & 3];
  }
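
  // Illustrative usage sketch (not part of the library): GetMaskFromBits maps a 2-bit
  // pattern to a lane mask, which is handy for the remainder of a vectorized loop,
  // e.g. with a hypothetical remainder `rest` (0 or 1 elements left):
  //
  //   auto m = SIMD<mask64,2>::GetMaskFromBits((1u << rest) - 1);
  //   SIMD<double,2> tail(p + i, m);   // masked load of the remaining elements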


  template<>
  class alignas(16) SIMD<int64_t,2>
  {
    __m128i data;

  public:
    static constexpr int Size() { return 2; }
    SIMD () {}
    SIMD (const SIMD &) = default;
    SIMD (int64_t v0, int64_t v1) { data = _mm_set_epi64x(v1,v0); }
    SIMD (std::array<int64_t, 2> arr)
      : data{_mm_set_epi64x(arr[1],arr[0])}
    {}

    SIMD & operator= (const SIMD &) = default;

    SIMD (int64_t val) { data = _mm_set1_epi64x(val); }
    SIMD (__m128i _data) { data = _data; }

    template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int64_t(int)>>::value, int>::type = 0>
    SIMD (const T & func)
    {
      // note: _mm_set_epi64x (not _mm_set_epi64) takes int64_t arguments
      data = _mm_set_epi64x(func(1), func(0));
    }

    NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; }
    NETGEN_INLINE __m128i Data() const { return data; }
    NETGEN_INLINE __m128i & Data() { return data; }
    static SIMD FirstInt(int n0=0) { return { n0, n0+1 }; }
  };


  NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a) { return _mm_sub_epi64(_mm_setzero_si128(), a.Data()); }
  NETGEN_INLINE SIMD<int64_t,2> operator+ (SIMD<int64_t,2> a, SIMD<int64_t,2> b) { return _mm_add_epi64(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a, SIMD<int64_t,2> b) { return _mm_sub_epi64(a.Data(),b.Data()); }
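
  // Illustrative usage sketch (not part of the library): FirstInt yields the index
  // pair {n0, n0+1}; together with operator+ it advances vectorized loop indices:
  //
  //   auto idx = SIMD<int64_t,2>::FirstInt(0);   // {0, 1}
  //   idx = idx + SIMD<int64_t,2>(2);            // {2, 3}, the next pair of indices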

  template<>
  class alignas(16) SIMD<double,2>
  {
    __m128d data;

  public:
    static constexpr int Size() { return 2; }
    SIMD () {}
    SIMD (const SIMD &) = default;
    SIMD (double v0, double v1) { data = _mm_set_pd(v1,v0); }
    SIMD (std::array<double, 2> arr)
      : data{_mm_set_pd(arr[1], arr[0])}
    {}

    SIMD & operator= (const SIMD &) = default;

    SIMD (double val) { data = _mm_set1_pd(val); }
    SIMD (int val)    { data = _mm_set1_pd(val); }
    SIMD (size_t val) { data = _mm_set1_pd(val); }

    SIMD (double const * p) { data = _mm_loadu_pd(p); }
    SIMD (double const * p, SIMD<mask64,2> mask)
    {
#ifdef __AVX__
      data = _mm_maskload_pd(p, mask.Data());
#else
      // this version segfaults if p points to the last allowed element
      // (happened on Mac with the new SparseCholesky-factorization)
      // data = _mm_and_pd(_mm_castsi128_pd(mask.Data()), _mm_loadu_pd(p));
      auto pmask = (int64_t*)&mask;
      data = _mm_set_pd (pmask[1] ? p[1] : 0.0, pmask[0] ? p[0] : 0.0);
#endif
    }
    SIMD (__m128d _data) { data = _data; }

    void Store (double * p) { _mm_storeu_pd(p, data); }
    void Store (double * p, SIMD<mask64,2> mask)
    {
#ifdef __AVX__
      _mm_maskstore_pd(p, mask.Data(), data);
#else
      /*
      _mm_storeu_pd (p, _mm_or_pd (_mm_and_pd(_mm_castsi128_pd(mask.Data()), data),
                                   _mm_andnot_pd(_mm_castsi128_pd(mask.Data()), _mm_loadu_pd(p))));
      */
      auto pmask = (int64_t*)&mask;
      if (pmask[0]) p[0] = (*this)[0];
      if (pmask[1]) p[1] = (*this)[1];
#endif
    }

    template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
    SIMD (const T & func)
    {
      data = _mm_set_pd(func(1), func(0));
    }

    NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
    NETGEN_INLINE __m128d Data() const { return data; }
    NETGEN_INLINE __m128d & Data() { return data; }

    operator std::tuple<double&,double&> ()
    {
      auto pdata = (double*)&data;
      return std::tuple<double&,double&>(pdata[0], pdata[1]);
    }
  };

  NETGEN_INLINE SIMD<double,2> operator- (SIMD<double,2> a) { return _mm_xor_pd(a.Data(), _mm_set1_pd(-0.0)); }
  NETGEN_INLINE SIMD<double,2> operator+ (SIMD<double,2> a, SIMD<double,2> b) { return _mm_add_pd(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<double,2> operator- (SIMD<double,2> a, SIMD<double,2> b) { return _mm_sub_pd(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<double,2> operator* (SIMD<double,2> a, SIMD<double,2> b) { return _mm_mul_pd(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<double,2> operator/ (SIMD<double,2> a, SIMD<double,2> b) { return _mm_div_pd(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<double,2> operator* (double a, SIMD<double,2> b) { return _mm_set1_pd(a)*b; }
  NETGEN_INLINE SIMD<double,2> operator* (SIMD<double,2> b, double a) { return _mm_set1_pd(a)*b; }
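
  // Illustrative usage sketch (not part of the library): a hypothetical y += alpha*x
  // kernel built from the operators above; the masked load/store handles an odd tail:
  //
  //   void axpy (double alpha, const double * x, double * y, size_t n)
  //   {
  //     size_t i = 0;
  //     for ( ; i+2 <= n; i += 2)
  //       (SIMD<double,2>(y+i) + alpha * SIMD<double,2>(x+i)).Store(y+i);
  //     auto m = SIMD<mask64,2>::GetMaskFromBits((1u << (n-i)) - 1);   // 0 or 1 element left
  //     SIMD<double,2> yi(y+i, m), xi(x+i, m);
  //     (yi + alpha * xi).Store(y+i, m);
  //   }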

  template<>
  NETGEN_INLINE auto Unpack (SIMD<double,2> a, SIMD<double,2> b)
  {
    return std::make_tuple(SIMD<double,2>(_mm_unpacklo_pd(a.Data(),b.Data())),
                           SIMD<double,2>(_mm_unpackhi_pd(a.Data(),b.Data())));
  }
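
  // Illustrative usage sketch (not part of the library): Unpack interleaves the low
  // and the high lanes of two vectors, i.e. a 2x2 transpose:
  //
  //   SIMD<double,2> a(1.0, 2.0), b(3.0, 4.0);
  //   auto [lo, hi] = Unpack(a, b);   // lo = {1,3}, hi = {2,4}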

  // horizontal add: returns { a0+a1, b0+b1 }, with a plain SSE2 fallback if SSE3 is unavailable
  NETGEN_INLINE __m128d my_mm_hadd_pd(__m128d a, __m128d b) {
#if defined(__SSE3__) || defined(__AVX__)
    return _mm_hadd_pd(a,b);
#else
    return _mm_add_pd( _mm_unpacklo_pd(a,b), _mm_unpackhi_pd(a,b) );
#endif
  }

#ifndef __AVX__
  // 64-bit signed compare-greater-than: _mm_cmpgt_epi64 needs SSE4.2, so fall back to scalar compares
  NETGEN_INLINE __m128i my_mm_cmpgt_epi64(__m128i a, __m128i b) {
    auto res_lo = _mm_cvtsi128_si64(a) > _mm_cvtsi128_si64(b) ? -1:0;
    auto res_hi = _mm_cvtsi128_si64(_mm_srli_si128(a,8)) > _mm_cvtsi128_si64(_mm_srli_si128(b,8)) ? -1 : 0;
    return _mm_set_epi64x(res_hi,res_lo);
  }
#else
  NETGEN_INLINE __m128i my_mm_cmpgt_epi64(__m128i a, __m128i b) {
    return _mm_cmpgt_epi64(a,b);
  }
#endif


  NETGEN_INLINE SIMD<double,2> sqrt (SIMD<double,2> a) { return _mm_sqrt_pd(a.Data()); }
  NETGEN_INLINE SIMD<double,2> fabs (SIMD<double,2> a) { return _mm_max_pd(a.Data(), (-a).Data()); }
  using std::floor;
  NETGEN_INLINE SIMD<double,2> floor (SIMD<double,2> a)
  { return ngcore::SIMD<double,2>([&](int i)->double { return floor(a[i]); } ); }
  using std::ceil;
  NETGEN_INLINE SIMD<double,2> ceil (SIMD<double,2> a)
  { return ngcore::SIMD<double,2>([&](int i)->double { return ceil(a[i]); } ); }

  NETGEN_INLINE SIMD<mask64,2> operator<= (SIMD<double,2> a , SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmple_pd(a.Data(),b.Data())); }
  NETGEN_INLINE SIMD<mask64,2> operator< (SIMD<double,2> a , SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmplt_pd(a.Data(),b.Data())); }
  NETGEN_INLINE SIMD<mask64,2> operator>= (SIMD<double,2> a , SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmpge_pd(a.Data(),b.Data())); }
  NETGEN_INLINE SIMD<mask64,2> operator> (SIMD<double,2> a , SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmpgt_pd(a.Data(),b.Data())); }
  NETGEN_INLINE SIMD<mask64,2> operator== (SIMD<double,2> a , SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmpeq_pd(a.Data(),b.Data())); }
  NETGEN_INLINE SIMD<mask64,2> operator!= (SIMD<double,2> a , SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmpneq_pd(a.Data(),b.Data())); }

  // use the SSE4.2-safe helper for all 64-bit greater-than compares
  NETGEN_INLINE SIMD<mask64,2> operator<= (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
  { return _mm_xor_si128(my_mm_cmpgt_epi64(a.Data(),b.Data()),_mm_set1_epi32(-1)); }
  NETGEN_INLINE SIMD<mask64,2> operator< (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
  { return my_mm_cmpgt_epi64(b.Data(),a.Data()); }
  NETGEN_INLINE SIMD<mask64,2> operator>= (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
  { return _mm_xor_si128(my_mm_cmpgt_epi64(b.Data(),a.Data()),_mm_set1_epi32(-1)); }
  NETGEN_INLINE SIMD<mask64,2> operator> (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
  { return my_mm_cmpgt_epi64(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<mask64,2> operator== (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
  { return _mm_cmpeq_epi64(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<mask64,2> operator!= (SIMD<int64_t,2> a , SIMD<int64_t,2> b)
  { return _mm_xor_si128(_mm_cmpeq_epi64(a.Data(),b.Data()),_mm_set1_epi32(-1)); }


  NETGEN_INLINE SIMD<mask64,2> operator&& (SIMD<mask64,2> a, SIMD<mask64,2> b)
  { return _mm_castpd_si128(_mm_and_pd (_mm_castsi128_pd(a.Data()), _mm_castsi128_pd(b.Data()))); }
  NETGEN_INLINE SIMD<mask64,2> operator|| (SIMD<mask64,2> a, SIMD<mask64,2> b)
  { return _mm_castpd_si128(_mm_or_pd (_mm_castsi128_pd(a.Data()), _mm_castsi128_pd(b.Data()))); }
  NETGEN_INLINE SIMD<mask64,2> operator! (SIMD<mask64,2> a)
  { return _mm_castpd_si128(_mm_xor_pd (_mm_castsi128_pd(a.Data()), _mm_castsi128_pd(_mm_cmpeq_epi64(a.Data(),a.Data())))); }
#ifdef __SSE4_1__
  NETGEN_INLINE SIMD<double,2> If (SIMD<mask64,2> a, SIMD<double,2> b, SIMD<double,2> c)
  { return _mm_blendv_pd(c.Data(), b.Data(), _mm_castsi128_pd(a.Data())); }
#else
  NETGEN_INLINE SIMD<double,2> If (SIMD<mask64,2> a, SIMD<double,2> b, SIMD<double,2> c)
  {
    return _mm_or_pd(
                     _mm_andnot_pd(_mm_castsi128_pd(a.Data()), c.Data()),
                     _mm_and_pd(b.Data(), _mm_castsi128_pd(a.Data()))
                     );
  }
#endif // __SSE4_1__

  NETGEN_INLINE SIMD<double,2> IfPos (SIMD<double,2> a, SIMD<double,2> b, SIMD<double,2> c)
  { return ngcore::SIMD<double,2>([&](int i)->double { return a[i]>0 ? b[i] : c[i]; }); }
  NETGEN_INLINE SIMD<double,2> IfZero (SIMD<double,2> a, SIMD<double,2> b, SIMD<double,2> c)
  { return ngcore::SIMD<double,2>([&](int i)->double { return a[i]==0. ? b[i] : c[i]; }); }
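
  // Illustrative usage sketch (not part of the library): comparisons return a
  // SIMD<mask64,2> that selects lanes in If, e.g. a hypothetical branch-free clamp:
  //
  //   SIMD<double,2> Clamp (SIMD<double,2> x, SIMD<double,2> lo, SIMD<double,2> hi)
  //   {
  //     x = If(x < lo, lo, x);
  //     x = If(x > hi, hi, x);
  //     return x;
  //   }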


  NETGEN_INLINE double HSum (SIMD<double,2> sd)
  {
    return _mm_cvtsd_f64 (my_mm_hadd_pd (sd.Data(), sd.Data()));
  }

  NETGEN_INLINE auto HSum (SIMD<double,2> sd1, SIMD<double,2> sd2)
  {
    __m128d hv2 = my_mm_hadd_pd(sd1.Data(), sd2.Data());
    return SIMD<double,2> (hv2);
    // return SIMD<double,2>(_mm_cvtsd_f64 (hv2), _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
  }
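
  // Illustrative usage sketch (not part of the library): HSum reduces the two lanes
  // to a scalar, e.g. at the end of a hypothetical dot product (n assumed even):
  //
  //   double Dot (const double * x, const double * y, size_t n)
  //   {
  //     SIMD<double,2> sum(0.0);
  //     for (size_t i = 0; i+2 <= n; i += 2)
  //       sum = sum + SIMD<double,2>(x+i) * SIMD<double,2>(y+i);
  //     return HSum(sum);
  //   }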

  NETGEN_INLINE SIMD<int64_t, 2> If(SIMD<mask64, 2> a, SIMD<int64_t, 2> b,
                                    SIMD<int64_t, 2> c) {
    return _mm_or_si128(
                        _mm_andnot_si128(a.Data(), c.Data()),
                        _mm_and_si128(b.Data(), a.Data())
                        );
  }

}

#endif // NETGEN_CORE_SIMD_SSE_HPP