#ifndef NETGEN_CORE_SIMD_SSE_HPP
#define NETGEN_CORE_SIMD_SSE_HPP

/**************************************************************************/
/* File:   simd_sse.hpp                                                   */
/* Author: Joachim Schoeberl, Matthias Hochsteger                         */
/* Date:   25. Mar. 16                                                    */
/**************************************************************************/

#include <immintrin.h>

namespace ngcore
{

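  // SSE (128-bit) specializations of the ngcore SIMD types: two 64-bit lanes per register.
  // SIMD<mask64,2> holds an all-ones / all-zero 64-bit mask per lane; SIMD(int i) activates
  // the first i lanes. Masks are produced by the vector comparisons below and consumed by
  // the masked load/store operations.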
  template <>
  class SIMD<mask64,2>
  {
    __m128i mask;
  public:
    SIMD (int i)
      : mask(_mm_cmpgt_epi32(_mm_set1_epi32(i),
                             _mm_set_epi32(1, 1, 0, 0)))
    { ; }
    SIMD (__m128i _mask) : mask(_mask) { ; }
    __m128i Data() const { return mask; }
    static constexpr int Size() { return 2; }
    static NETGEN_INLINE SIMD<mask64, 2> GetMaskFromBits (unsigned int i);
  };

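  // Lookup table for GetMaskFromBits: the two low bits of the index select which lanes
  // are active. Only the sign bit of each 64-bit lane matters for _mm_maskload_pd /
  // _mm_maskstore_pd, hence setting just the upper 32 bits of a lane to -1 is sufficient.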
  static SIMD<mask64, 2> masks_from_2bits[4] = {
    _mm_set_epi32 (0,0,0,0), _mm_set_epi32 (0,0,-1,0),
    _mm_set_epi32 (-1,0,0,0), _mm_set_epi32 (-1,0,-1,0),
  };

  NETGEN_INLINE SIMD<mask64, 2> SIMD<mask64, 2> :: GetMaskFromBits (unsigned int i)
  {
    return masks_from_2bits[i & 3];
  }


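  // Two packed 64-bit signed integers in one SSE register.
  // FirstInt(n0) yields { n0, n0+1 }, useful for building lane-index vectors.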
  template<>
  class alignas(16) SIMD<int64_t,2>
  {
    __m128i data;

  public:
    static constexpr int Size() { return 2; }
    SIMD () {}
    SIMD (const SIMD &) = default;
    SIMD (int64_t v0, int64_t v1) { data = _mm_set_epi64x(v1,v0); }
    SIMD (std::array<int64_t, 2> arr)
        : data{_mm_set_epi64x(arr[1],arr[0])}
    {}

    SIMD & operator= (const SIMD &) = default;

    SIMD (int64_t val) { data = _mm_set1_epi64x(val); }
    SIMD (__m128i _data) { data = _data; }

    // construct from a generator: lane i is initialized with func(i)
    template<typename T, typename std::enable_if<std::is_convertible<T, std::function<int64_t(int)>>::value, int>::type = 0>
    SIMD (const T & func)
    {
      data = _mm_set_epi64x(func(1), func(0));
    }

    NETGEN_INLINE auto operator[] (int i) const { return ((int64_t*)(&data))[i]; }
    NETGEN_INLINE __m128i Data() const { return data; }
    NETGEN_INLINE __m128i & Data() { return data; }
    static SIMD FirstInt(int n0=0) { return { n0, n0+1 }; }
  };



  NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a) { return _mm_sub_epi64(_mm_setzero_si128(), a.Data()); }
  NETGEN_INLINE SIMD<int64_t,2> operator+ (SIMD<int64_t,2> a, SIMD<int64_t,2> b) { return _mm_add_epi64(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<int64_t,2> operator- (SIMD<int64_t,2> a, SIMD<int64_t,2> b) { return _mm_sub_epi64(a.Data(),b.Data()); }


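  // Two packed doubles in one SSE register: the main floating-point SIMD type of this header.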
  template<>
  class alignas(16) SIMD<double,2>
  {
    __m128d data;

  public:
    static constexpr int Size() { return 2; }
    SIMD () {}
    SIMD (const SIMD &) = default;
    SIMD (double v0, double v1) { data = _mm_set_pd(v1,v0); }
    SIMD (std::array<double, 2> arr)
        : data{_mm_set_pd(arr[1], arr[0])}
    {}

    SIMD & operator= (const SIMD &) = default;

    SIMD (double val) { data = _mm_set1_pd(val); }
    SIMD (int val)    { data = _mm_set1_pd(val); }
    SIMD (size_t val) { data = _mm_set1_pd(val); }

    SIMD (double const * p) { data = _mm_loadu_pd(p); }
    SIMD (double const * p, SIMD<mask64,2> mask)
      {
#ifdef __AVX__
        data = _mm_maskload_pd(p, mask.Data());
#else
        // this version segfaults if p points to the last allowed element
        // (happened on Mac with the new SparseCholesky factorization):
        // data = _mm_and_pd(_mm_castsi128_pd(mask.Data()), _mm_loadu_pd(p));
        auto pmask = (int64_t*)&mask;
        data = _mm_set_pd (pmask[1] ? p[1] : 0.0, pmask[0] ? p[0] : 0.0);
#endif
      }
    SIMD (__m128d _data) { data = _data; }

    void Store (double * p) { _mm_storeu_pd(p, data); }
    void Store (double * p, SIMD<mask64,2> mask)
    {
#ifdef __AVX__
      _mm_maskstore_pd(p, mask.Data(), data);
#else
      /*
      _mm_storeu_pd (p, _mm_or_pd (_mm_and_pd(_mm_castsi128_pd(mask.Data()), data),
                                   _mm_andnot_pd(_mm_castsi128_pd(mask.Data()), _mm_loadu_pd(p))));
      */
      auto pmask = (int64_t*)&mask;
      if (pmask[0]) p[0] = (*this)[0];
      if (pmask[1]) p[1] = (*this)[1];
#endif
    }

    // construct from a generator: lane i is initialized with func(i)
    template<typename T, typename std::enable_if<std::is_convertible<T, std::function<double(int)>>::value, int>::type = 0>
    SIMD (const T & func)
    {
      data = _mm_set_pd(func(1), func(0));
    }

    NETGEN_INLINE double operator[] (int i) const { return ((double*)(&data))[i]; }
    NETGEN_INLINE __m128d Data() const { return data; }
    NETGEN_INLINE __m128d & Data() { return data; }

    operator std::tuple<double&,double&> ()
    {
      auto pdata = (double*)&data;
      return std::tuple<double&,double&>(pdata[0], pdata[1]);
    }
  };

  NETGEN_INLINE SIMD<double,2> operator- (SIMD<double,2> a) { return _mm_xor_pd(a.Data(), _mm_set1_pd(-0.0)); }
  NETGEN_INLINE SIMD<double,2> operator+ (SIMD<double,2> a, SIMD<double,2> b) { return _mm_add_pd(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<double,2> operator- (SIMD<double,2> a, SIMD<double,2> b) { return _mm_sub_pd(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<double,2> operator* (SIMD<double,2> a, SIMD<double,2> b) { return _mm_mul_pd(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<double,2> operator/ (SIMD<double,2> a, SIMD<double,2> b) { return _mm_div_pd(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<double,2> operator* (double a, SIMD<double,2> b) { return _mm_set1_pd(a)*b; }
  NETGEN_INLINE SIMD<double,2> operator* (SIMD<double,2> b, double a) { return _mm_set1_pd(a)*b; }

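  // Unpack: 2x2 transpose of two registers, returning ({a0,b0}, {a1,b1}).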
  template<>
  NETGEN_INLINE auto Unpack (SIMD<double,2> a, SIMD<double,2> b)
  {
    return std::make_tuple(SIMD<double,2>(_mm_unpacklo_pd(a.Data(),b.Data())),
                           SIMD<double,2>(_mm_unpackhi_pd(a.Data(),b.Data())));
  }

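  // Horizontal add of two registers: result is { a0+a1, b0+b1 }.
  // Uses SSE3 _mm_hadd_pd when available, otherwise an unpack/add fallback.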
  NETGEN_INLINE __m128d my_mm_hadd_pd(__m128d a, __m128d b) {
#if defined(__SSE3__) || defined(__AVX__)
    return _mm_hadd_pd(a,b);
#else
    return _mm_add_pd( _mm_unpacklo_pd(a,b), _mm_unpackhi_pd(a,b) );
#endif
  }

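  // _mm_cmpgt_epi64 requires SSE4.2; it is only used here when AVX (which implies
  // SSE4.2) is enabled, otherwise the two lanes are compared in scalar registers.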
#ifndef __AVX__
  NETGEN_INLINE __m128i my_mm_cmpgt_epi64(__m128i a, __m128i b) {
    auto res_lo = _mm_cvtsi128_si64(a) > _mm_cvtsi128_si64(b) ? -1 : 0;
    auto res_hi = _mm_cvtsi128_si64(_mm_srli_si128(a,8)) > _mm_cvtsi128_si64(_mm_srli_si128(b,8)) ? -1 : 0;
    return _mm_set_epi64x(res_hi, res_lo);
  }
#else
  NETGEN_INLINE __m128i my_mm_cmpgt_epi64(__m128i a, __m128i b) {
    return _mm_cmpgt_epi64(a,b);
  }
#endif


  NETGEN_INLINE SIMD<double,2> sqrt (SIMD<double,2> a) { return _mm_sqrt_pd(a.Data()); }
  NETGEN_INLINE SIMD<double,2> fabs (SIMD<double,2> a) { return _mm_max_pd(a.Data(), (-a).Data()); }
  using std::floor;
  NETGEN_INLINE SIMD<double,2> floor (SIMD<double,2> a)
  { return ngcore::SIMD<double,2>([&](int i)->double { return floor(a[i]); } ); }
  using std::ceil;
  NETGEN_INLINE SIMD<double,2> ceil (SIMD<double,2> a)
  { return ngcore::SIMD<double,2>([&](int i)->double { return ceil(a[i]); } ); }

  NETGEN_INLINE SIMD<mask64,2> operator<= (SIMD<double,2> a, SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmple_pd(a.Data(),b.Data())); }
  NETGEN_INLINE SIMD<mask64,2> operator< (SIMD<double,2> a, SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmplt_pd(a.Data(),b.Data())); }
  NETGEN_INLINE SIMD<mask64,2> operator>= (SIMD<double,2> a, SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmpge_pd(a.Data(),b.Data())); }
  NETGEN_INLINE SIMD<mask64,2> operator> (SIMD<double,2> a, SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmpgt_pd(a.Data(),b.Data())); }
  NETGEN_INLINE SIMD<mask64,2> operator== (SIMD<double,2> a, SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmpeq_pd(a.Data(),b.Data())); }
  NETGEN_INLINE SIMD<mask64,2> operator!= (SIMD<double,2> a, SIMD<double,2> b)
  { return _mm_castpd_si128( _mm_cmpneq_pd(a.Data(),b.Data())); }

  NETGEN_INLINE SIMD<mask64,2> operator<= (SIMD<int64_t,2> a, SIMD<int64_t,2> b)
  { return _mm_xor_si128(my_mm_cmpgt_epi64(a.Data(),b.Data()), _mm_set1_epi32(-1)); }
  NETGEN_INLINE SIMD<mask64,2> operator< (SIMD<int64_t,2> a, SIMD<int64_t,2> b)
  { return my_mm_cmpgt_epi64(b.Data(),a.Data()); }
  NETGEN_INLINE SIMD<mask64,2> operator>= (SIMD<int64_t,2> a, SIMD<int64_t,2> b)
  { return _mm_xor_si128(my_mm_cmpgt_epi64(b.Data(),a.Data()), _mm_set1_epi32(-1)); }
  NETGEN_INLINE SIMD<mask64,2> operator> (SIMD<int64_t,2> a, SIMD<int64_t,2> b)
  { return my_mm_cmpgt_epi64(a.Data(),b.Data()); }
  // note: _mm_cmpeq_epi64 requires SSE4.1
  NETGEN_INLINE SIMD<mask64,2> operator== (SIMD<int64_t,2> a, SIMD<int64_t,2> b)
  { return _mm_cmpeq_epi64(a.Data(),b.Data()); }
  NETGEN_INLINE SIMD<mask64,2> operator!= (SIMD<int64_t,2> a, SIMD<int64_t,2> b)
  { return _mm_xor_si128(_mm_cmpeq_epi64(a.Data(),b.Data()), _mm_set1_epi32(-1)); }


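  // Boolean combination of masks and the lane-wise select If(mask, b, c),
  // which picks b where the mask is set and c elsewhere.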
  NETGEN_INLINE SIMD<mask64,2> operator&& (SIMD<mask64,2> a, SIMD<mask64,2> b)
  { return _mm_castpd_si128(_mm_and_pd (_mm_castsi128_pd(a.Data()), _mm_castsi128_pd(b.Data()))); }
  NETGEN_INLINE SIMD<mask64,2> operator|| (SIMD<mask64,2> a, SIMD<mask64,2> b)
  { return _mm_castpd_si128(_mm_or_pd (_mm_castsi128_pd(a.Data()), _mm_castsi128_pd(b.Data()))); }
  NETGEN_INLINE SIMD<mask64,2> operator! (SIMD<mask64,2> a)
  { return _mm_castpd_si128(_mm_xor_pd (_mm_castsi128_pd(a.Data()), _mm_castsi128_pd(_mm_cmpeq_epi64(a.Data(),a.Data())))); }

#ifdef __SSE4_1__
  NETGEN_INLINE SIMD<double,2> If (SIMD<mask64,2> a, SIMD<double,2> b, SIMD<double,2> c)
  { return _mm_blendv_pd(c.Data(), b.Data(), _mm_castsi128_pd(a.Data())); }
#else
  NETGEN_INLINE SIMD<double,2> If (SIMD<mask64,2> a, SIMD<double,2> b, SIMD<double,2> c)
  {
    return _mm_or_pd(_mm_andnot_pd(_mm_castsi128_pd(a.Data()), c.Data()),
                     _mm_and_pd(b.Data(), _mm_castsi128_pd(a.Data())));
  }
#endif // __SSE4_1__

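  // IfPos / IfZero: lane-wise selects based on a sign test / zero test of a,
  // implemented via the scalar generator constructor rather than intrinsics.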
  NETGEN_INLINE SIMD<double,2> IfPos (SIMD<double,2> a, SIMD<double,2> b, SIMD<double,2> c)
  { return ngcore::SIMD<double,2>([&](int i)->double { return a[i]>0 ? b[i] : c[i]; }); }
  NETGEN_INLINE SIMD<double,2> IfZero (SIMD<double,2> a, SIMD<double,2> b, SIMD<double,2> c)
  { return ngcore::SIMD<double,2>([&](int i)->double { return a[i]==0. ? b[i] : c[i]; }); }


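  // Horizontal sums: HSum(sd) returns sd[0]+sd[1]; the two-argument version packs
  // the two sums into one register as { HSum(sd1), HSum(sd2) }.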
  NETGEN_INLINE double HSum (SIMD<double,2> sd)
  {
    return _mm_cvtsd_f64 (my_mm_hadd_pd (sd.Data(), sd.Data()));
  }

  NETGEN_INLINE auto HSum (SIMD<double,2> sd1, SIMD<double,2> sd2)
  {
    __m128d hv2 = my_mm_hadd_pd(sd1.Data(), sd2.Data());
    return SIMD<double,2> (hv2);
    // return SIMD<double,2>(_mm_cvtsd_f64 (hv2),  _mm_cvtsd_f64(_mm_shuffle_pd (hv2, hv2, 3)));
  }

  NETGEN_INLINE SIMD<int64_t,2> If (SIMD<mask64,2> a, SIMD<int64_t,2> b, SIMD<int64_t,2> c)
  {
    return _mm_or_si128(_mm_andnot_si128(a.Data(), c.Data()),
                        _mm_and_si128(b.Data(), a.Data()));
  }

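  // Usage sketch (illustrative only, not part of this header's interface): add two
  // double buffers of length n, handling an odd tail element with a masked
  // load/store. The function name and signature are made up for the example.
  //
  //   void add_inplace (double * a, const double * b, size_t n)
  //   {
  //     size_t i = 0;
  //     for ( ; i+2 <= n; i += 2)
  //       (SIMD<double,2>(a+i) + SIMD<double,2>(b+i)).Store(a+i);
  //     if (i < n)
  //       {
  //         auto m = SIMD<mask64,2>(int(n-i));   // first n-i lanes active
  //         (SIMD<double,2>(a+i, m) + SIMD<double,2>(b+i, m)).Store(a+i, m);
  //       }
  //   }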
}

#endif // NETGEN_CORE_SIMD_SSE_HPP