1 // Copyright (C) 2016-2017 Erik Hofman - erik@ehofman.com
2 //
3 // This library is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU Library General Public
5 // License as published by the Free Software Foundation; either
6 // version 2 of the License, or (at your option) any later version.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
11 // Library General Public License for more details.
12 //
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.
16 //
17 
18 #ifndef __SIMD_H__
19 #define __SIMD_H__	1
20 
21 #ifdef HAVE_CONFIG_H
22 # include <simgear/simgear_config.h>
23 #endif
24 
25 #include <cstdint>
26 #include <cstring>
27 #include <cassert>
28 #include <cmath>
29 #include <new>
30 
31 #if defined(_MSC_VER)
32 # include <intrin.h>
33 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
34 # include <x86intrin.h>
35 #elif defined(__GNUC__) && defined(__ARM_NEON__)
36 # include <arm_neon.h>
37 # include <simgear/math/simd_neon.hxx>
38 #endif
39 
40 #include <simgear/math/SGLimits.hxx>
41 #include <simgear/math/SGMisc.hxx>
42 
43 template<typename T, int N> class simd4_t;
44 
45 namespace simd4
46 {
47 
48 template<typename T, int N>
min(simd4_t<T,N> v1,const simd4_t<T,N> & v2)49 inline simd4_t<T,N> min(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
50     for (int i=0; i<N; ++i) {
51         v1[i] = SGMisc<T>::min(v1[i], v2[i]);
52     }
53     return v1;
54 }
55 
56 template<typename T, int N>
max(simd4_t<T,N> v1,const simd4_t<T,N> & v2)57 inline simd4_t<T,N> max(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
58     for (int i=0; i<N; ++i) {
59         v1[i] = SGMisc<T>::max(v1[i], v2[i]);
60     }
61     return v1;
62 }
63 
64 template<typename T, int N>
abs(simd4_t<T,N> v)65 inline simd4_t<T,N> abs(simd4_t<T,N> v) {
66     for (int i=0; i<N; ++i) {
67         v[i] = std::abs(v[i]);
68     }
69     return v;
70 }
71 
72 template<typename T, int N>
magnitude2(const simd4_t<T,N> & vi)73 inline T magnitude2(const simd4_t<T,N>& vi) {
74     simd4_t<T,N> v(vi);
75     T mag2 = 0;
76     v = v*v;
77     for (int i=0; i<N; ++i) {
78         mag2 += v[i];
79     }
80     return mag2;
81 }
82 
83 template<typename T, int N>
interpolate(T tau,const simd4_t<T,N> & v1,const simd4_t<T,N> & v2)84 inline simd4_t<T,N> interpolate(T tau, const simd4_t<T,N>& v1, const simd4_t<T,N>& v2) {
85     return (T(1)-tau)*v1 + tau*v2;
86 }
87 
88 template<typename T, int N>
magnitude(const simd4_t<T,N> & v)89 inline T magnitude(const simd4_t<T,N>& v) {
90     return std::sqrt(magnitude2(v));
91 }
92 
93 template<typename T, int N>
normalize(simd4_t<T,N> & v)94 inline T normalize(simd4_t<T,N>& v) {
95     T mag = simd4::magnitude(v);
96     if (mag) {
97         v /= mag;
98     } else {
99         v = T(0);
100     }
101     return mag;
102 }
103 
104 template<typename T, int N>
dot(const simd4_t<T,N> & v1,const simd4_t<T,N> & v2)105 inline T dot(const simd4_t<T,N>& v1, const simd4_t<T,N>& v2) {
106     simd4_t<T,N> v(v1*v2);
107     T dp = T(0);
108     for (int i=0; i<N; ++i) {
109        dp += v[i];
110     }
111     return dp;
112 }
113 
114 template<typename T>
cross(const simd4_t<T,3> & v1,const simd4_t<T,3> & v2)115 inline simd4_t<T,3> cross(const simd4_t<T,3>& v1, const simd4_t<T,3>& v2)
116 {
117     simd4_t<T,3> d;
118     d[0] = v1[1]*v2[2] - v1[2]*v2[1];
119     d[1] = v1[2]*v2[0] - v1[0]*v2[2];
120     d[2] = v1[0]*v2[1] - v1[1]*v2[0];
121     return d;
122 }
123 
124 } /* namespace simd4 */
125 
126 
/**
 * Generic (non-SIMD) four-slot vector holding N (<= 4) live elements of
 * type T. The slots beyond N are always kept at zero so that the SIMD
 * specializations and this fallback behave identically.
 */
template<typename T, int N>
class simd4_t
{
private:
    // Same storage viewed two ways: 'vec' spans the N logical elements,
    // '_v4' always spans all four slots so padding can be cleared.
    union {
       T _v4[4];
       T vec[N];
    };

public:
    // Zero every slot, padding included.
    simd4_t(void) {
        for (int k=0; k<4; ++k) _v4[k] = 0;
    }
    // Broadcast a scalar into the N live elements; padding stays zero.
    simd4_t(T s) {
        int k = 0;
        for (; k<N; ++k) vec[k] = s;
        for (; k<4; ++k) _v4[k] = 0;
    }
    simd4_t(T x, T y) : simd4_t(x,y,0,0) {}
    simd4_t(T x, T y, T z) : simd4_t(x,y,z,0) {}
    // Four-component constructor; slots at index N and above are reset
    // to zero afterwards.
    simd4_t(T x, T y, T z, T w) {
        _v4[0] = x; _v4[1] = y; _v4[2] = z; _v4[3] = w;
        for (int k=N; k<4; ++k) _v4[k] = 0;
    }
    // Construct from a plain array of N elements.
    explicit simd4_t(const T v[N]) {
        std::memcpy(vec, v, sizeof(T[N]));
        for (int k=N; k<4; ++k) _v4[k] = 0;
    }
    // Convert from a vector with a different element count M; slots not
    // covered by both sizes are cleared.
    template<int M>
    simd4_t(const simd4_t<T,M>& v) {
        std::memcpy(_v4, v.ptr(), sizeof(T[M]));
        for (int k=(M<N)?M:N; k<4; ++k) _v4[k] = 0;
    }
    ~simd4_t(void) {}

    // Raw access to the N live elements as a reference-to-array.
    inline T (&v4(void))[N] {
        return vec;
    }

    inline const T (&v4(void) const)[N] {
        return vec;
    }

    inline const T (&ptr(void) const)[N] {
        return vec;
    }

    inline T (&ptr(void))[N] {
        return vec;
    }

    inline const T& operator[](unsigned n) const {
        assert(n<N);
        return vec[n];
    }

    inline T& operator[](unsigned n) {
        assert(n<N);
        return vec[n];
    }

    // Cross-size assignment goes through the converting constructor.
    template<int M>
    inline simd4_t<T,N>& operator=(const simd4_t<T,M>& v) {
        *this = simd4_t<T,N>(v);
        return *this;
    }

    inline simd4_t<T,N>& operator+=(T s) {
        for (int k=0; k<N; ++k) vec[k] += s;
        return *this;
    }
    inline simd4_t<T,N>& operator+=(const T v[N]) {
        for (int k=0; k<N; ++k) vec[k] += v[k];
        return *this;
    }
    inline simd4_t<T,N>& operator+=(const simd4_t<T,N>& v) {
        for (int k=0; k<N; ++k) vec[k] += v[k];
        return *this;
    }

    inline simd4_t<T,N>& operator-=(T s) {
        for (int k=0; k<N; ++k) vec[k] -= s;
        return *this;
    }
    inline simd4_t<T,N>& operator-=(const T v[N]) {
        for (int k=0; k<N; ++k) vec[k] -= v[k];
        return *this;
    }
    inline simd4_t<T,N>& operator-=(const simd4_t<T,N>& v) {
        for (int k=0; k<N; ++k) vec[k] -= v[k];
        return *this;
    }

    inline simd4_t<T,N>& operator*=(T s) {
        for (int k=0; k<N; ++k) vec[k] *= s;
        return *this;
    }
    inline simd4_t<T,N>& operator*=(const T v[N]) {
        for (int k=0; k<N; ++k) vec[k] *= v[k];
        return *this;
    }
    inline simd4_t<T,N>& operator*=(const simd4_t<T,N>& v) {
        for (int k=0; k<N; ++k) vec[k] *= v[k];
        return *this;
    }

    inline simd4_t<T,N>& operator/=(T s) {
        for (int k=0; k<N; ++k) vec[k] /= s;
        return *this;
    }
    inline simd4_t<T,N>& operator/=(const T v[N]) {
        for (int k=0; k<N; ++k) vec[k] /= v[k];
        return *this;
    }
    inline simd4_t<T,N>& operator/=(const simd4_t<T,N>& v) {
        for (int k=0; k<N; ++k) vec[k] /= v[k];
        return *this;
    }
};
268 
269 template<typename T, int N>
operator -(const simd4_t<T,N> & v)270 inline simd4_t<T,N> operator-(const simd4_t<T,N>& v) {
271     simd4_t<T,N> r(T(0));
272     r -= v;
273     return r;
274 }
275 
276 template<typename T, int N>
operator +(simd4_t<T,N> v1,const simd4_t<T,N> & v2)277 inline simd4_t<T,N> operator+(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
278     v1 += v2;
279     return v1;
280 }
281 
282 template<typename T, int N>
operator -(simd4_t<T,N> v1,const simd4_t<T,N> & v2)283 inline simd4_t<T,N> operator-(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
284     v1 -= v2;
285     return v1;
286 }
287 
288 template<typename T, int N>
operator *(simd4_t<T,N> v1,const simd4_t<T,N> & v2)289 inline simd4_t<T,N> operator*(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
290     v1 *= v2;
291     return v1;
292 }
293 
294 template<typename T, int N>
operator /(simd4_t<T,N> v1,const simd4_t<T,N> & v2)295 inline simd4_t<T,N> operator/(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
296     v1 /= v2;
297     return v1;
298 }
299 
300 template<typename T, int N>
operator *(T f,simd4_t<T,N> v)301 inline simd4_t<T,N> operator*(T f, simd4_t<T,N> v) {
302     v *= f;
303     return v;
304 }
305 
306 template<typename T, int N>
operator *(simd4_t<T,N> v,T f)307 inline simd4_t<T,N> operator*(simd4_t<T,N> v, T f) {
308     v *= f;
309     return v;
310 }
311 
312 #ifdef ENABLE_SIMD_CODE
313 
314 # ifdef __SSE__
namespace simd4
{
// Bit masks used by the SSE/AVX specializations below to clear the
// unused padding lanes: mNa32/mNa64 keep the first N lanes and zero
// the rest (32-bit lanes for float, 64-bit lanes for double).
alignas(16) static const uint32_t m2a32[] = {
    0xffffffff,0xffffffff,0,0
  };
alignas(16) static const uint32_t m3a32[] = {
    0xffffffff,0xffffffff,0xffffffff,0
  };
alignas(32) static const uint64_t m2a64[] = {
    0xffffffffffffffff,0xffffffffffffffff,0,0
  };
alignas(32) static const uint64_t m3a64[] = {
    0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff,0
  };
}; /* namespace simd4 */
330 
331 
/**
 * SSE specialization: all N (<= 4) float elements live in one __m128
 * register; unused lanes are kept at zero by the constructors below
 * (and by the per-size constructor specializations that follow).
 */
template<int N>
class alignas(16) simd4_t<float,N>
{
private:
   typedef float  __vec4f_t[N];

    // Union type-puns the register with a plain float array so that
    // operator[] and ptr() can expose individual elements.
    union {
        __m128 simd4;
        __vec4f_t vec;
    };

public:
    simd4_t(void) { simd4 = _mm_setzero_ps(); }
    simd4_t(float f) { simd4 = _mm_set1_ps(f); }
    simd4_t(float x, float y) : simd4_t(x,y,0,0) {}
    simd4_t(float x, float y, float z) : simd4_t(x,y,z,0) {}
    simd4_t(float x, float y, float z, float w) { simd4 = _mm_set_ps(w,z,y,x); }
    // NOTE(review): loads four floats regardless of N; for N < 4 this
    // reads past the end of the caller's array unless one of the
    // per-size specializations below overrides it — confirm all used N
    // (2, 3, 4) are specialized.
    simd4_t(const __vec4f_t v) { simd4 = _mm_loadu_ps(v); }
    simd4_t(const simd4_t<float,4>& v) { simd4 = v.v4(); }
    simd4_t(const simd4_t<float,3>& v) { simd4 = v.v4(); }
    simd4_t(const simd4_t<float,2>& v) { simd4 = v.v4(); }
    simd4_t(const __m128& v) { simd4 = v; }

    // Direct access to the underlying SSE register.
    inline const __m128 &v4(void) const {
        return simd4;
    }
    inline __m128 &v4(void) {
        return simd4;
    }

    // Access the elements as a plain float array.
    inline const float (&ptr(void) const)[N] {
        return vec;
    }
    inline float (&ptr(void))[N] {
        return vec;
    }

    inline  const float& operator[](unsigned n) const {
        assert(n<N);
        return vec[n];
    }

    inline float& operator[](unsigned n) {
        assert(n<N);
        return vec[n];
    }

    // Cross-size assignment converts through the matching constructor
    // so padding lanes end up masked correctly.
    template<int M>
    inline simd4_t<float,N>& operator=(const simd4_t<float,M>& v) {
        simd4 = simd4_t<float,N>(v).v4();
        return *this;
    }
    inline simd4_t<float,N>& operator=(const __m128& v) {
        simd4 = v;
        return *this;
    }

    // Arithmetic operates on all four lanes; padding lanes hold zero so
    // the extra work is harmless (except /= by a scalar, which leaves
    // 0/s == 0 in the padding as well).
    inline simd4_t<float,N>& operator+=(float f) {
        simd4 = _mm_add_ps(simd4, _mm_set1_ps(f));
        return *this;
    }
    inline simd4_t<float,N>& operator+=(const simd4_t<float,N>& v) {
        simd4 = _mm_add_ps(simd4, v.v4());
        return *this;
    }

    inline simd4_t<float,N>& operator-=(float f) {
        simd4 = _mm_sub_ps(simd4, _mm_set1_ps(f));
        return *this;
    }
    inline simd4_t<float,N>& operator-=(const simd4_t<float,N>& v) {
        simd4 = _mm_sub_ps(simd4, v.v4());
        return *this;
    }

    inline simd4_t<float,N>& operator*=(float f) {
        simd4 = _mm_mul_ps(simd4, _mm_set1_ps(f));
        return *this;
    }
    inline simd4_t<float,N>& operator*=(const simd4_t<float,N>& v) {
        simd4 = _mm_mul_ps(simd4, v.v4());
        return *this;
    }

    inline simd4_t<float,N>& operator/=(float f) {
        simd4 = _mm_div_ps(simd4, _mm_set1_ps(f));
        return *this;
    }
    inline simd4_t<float,N>& operator/=(const simd4_t<float,N>& v) {
        simd4 = _mm_div_ps(simd4, v.v4());
        return *this;
    }
};
425 
// Lane masks built from the tables above: fmaskN keeps the first N
// float lanes and zeroes the rest.
static const __m128 fmask2 = _mm_load_ps((const float*)simd4::m2a32);
static const __m128 fmask3 = _mm_load_ps((const float*)simd4::m3a32);

// Four-component constructors: for N < 4 the surplus lanes are masked
// off so padding stays zero.
template<>
inline simd4_t<float,4>::simd4_t(float x, float y, float z, float w) {
    simd4 = _mm_set_ps(w,z,y,x);
}
template<>
inline simd4_t<float,3>::simd4_t(float x, float y, float z, float w) {
    simd4 = _mm_and_ps(_mm_set_ps(w,z,y,x), fmask3);
}
template<>
inline simd4_t<float,2>::simd4_t(float x, float y, float z, float w) {
    simd4 = _mm_and_ps(_mm_set_ps(w,z,y,x), fmask2);
}

// Cross-size copy constructors: narrowing masks off the extra lanes;
// widening relies on the source's padding already being zero.
template<>
inline simd4_t<float,4>::simd4_t(const simd4_t<float,4>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<float,4>::simd4_t(const simd4_t<float,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<float,4>::simd4_t(const simd4_t<float,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<float,3>::simd4_t(const simd4_t<float,4>& v) {
    simd4 = _mm_and_ps(v.v4(), fmask3);
}
template<>
inline simd4_t<float,3>::simd4_t(const simd4_t<float,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<float,3>::simd4_t(const simd4_t<float,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<float,2>::simd4_t(const simd4_t<float,4>& v) {
    simd4 = _mm_and_ps(v.v4(), fmask2);
}
template<>
inline simd4_t<float,2>::simd4_t(const simd4_t<float,3>& v) {
    simd4 = _mm_and_ps(v.v4(), fmask2);
}
template<>
inline simd4_t<float,2>::simd4_t(const simd4_t<float,2>& v) {
    simd4 = v.v4();
}

// Array constructor, N == 4: the source array has four elements, so an
// unaligned full load is safe here.
template<>
inline simd4_t<float,4>::simd4_t(const __vec4f_t v) {
    simd4 = _mm_loadu_ps(v);
}
483 template<>
simd4_t(const __vec4f_t v)484 inline simd4_t<float,3>::simd4_t(const __vec4f_t v) {
485     simd4 = _mm_and_ps(_mm_loadu_ps(v), fmask3);
486 }
487 template<>
simd4_t(const __vec4f_t v)488 inline simd4_t<float,2>::simd4_t(const __vec4f_t v) {
489     simd4 = _mm_and_ps(_mm_loadu_ps(v), fmask2);
490 }
// Scalar-broadcast constructors: for N < 4 the broadcast is masked so
// the padding lanes stay zero.
template<>
inline simd4_t<float,4>::simd4_t(float f) {
    simd4 = _mm_set1_ps(f);
}
template<>
inline simd4_t<float,3>::simd4_t(float f) {
    simd4 = _mm_and_ps(_mm_set1_ps(f), fmask3);
}

template<>
inline simd4_t<float,2>::simd4_t(float f) {
    simd4 = _mm_and_ps(_mm_set1_ps(f), fmask2);
}
504 
namespace simd4
{
// Horizontal sum of the four float lanes of an SSE register.
// http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
# ifdef __SSE3__
  inline static float hsum_ps_sse(__m128 v) {
    __m128 shuf = _mm_movehdup_ps(v);
    __m128 sums = _mm_add_ps(v, shuf);
    shuf        = _mm_movehl_ps(shuf, sums);
    sums        = _mm_add_ss(sums, shuf);
    return        _mm_cvtss_f32(sums);
  }
# else /* SSE */
  inline static float hsum_ps_sse(__m128 v) {
    __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 sums = _mm_add_ps(v, shuf);
    shuf        = _mm_movehl_ps(shuf, sums);
    sums        = _mm_add_ss(sums, shuf);
    return      _mm_cvtss_f32(sums);
  }
# endif

// Vectorized squared length: multiply, then horizontal-sum all lanes.
template<>
inline float magnitude2(const simd4_t<float,4>& v) {
    return hsum_ps_sse(_mm_mul_ps(v.v4(),v.v4()));
}

// Vectorized dot product.
template<>
inline float dot(const simd4_t<float,4>& v1, const simd4_t<float,4>& v2) {
    return hsum_ps_sse(_mm_mul_ps(v1.v4(),v2.v4()));
}

// Vectorized 3D cross product using a single rotate-multiply-subtract.
template<>
inline simd4_t<float,3> cross(const simd4_t<float,3>& v1, const simd4_t<float,3>& v2)
{
    // http://threadlocalmutex.com/?p=8
    __m128 v41 = v1.v4(), v42 = v2.v4();
    __m128 a = _mm_shuffle_ps(v41, v41, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 b = _mm_shuffle_ps(v42, v42, _MM_SHUFFLE(3, 0, 2, 1));
    __m128 c = _mm_sub_ps(_mm_mul_ps(v41, b), _mm_mul_ps(a, v42));
    return  _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
}

// Vectorized overloads of min/max/abs; these overload the generic
// function templates above for any simd4_t<float,N>.
template<int N>
inline simd4_t<float,N> min(simd4_t<float,N> v1, const simd4_t<float,N>& v2) {
    v1 = _mm_min_ps(v1.v4(), v2.v4());
    return v1;
}

template<int N>
inline simd4_t<float,N> max(simd4_t<float,N> v1, const simd4_t<float,N>& v2) {
    v1 = _mm_max_ps(v1.v4(), v2.v4());
    return v1;
}

template<int N>
inline simd4_t<float,N>abs(simd4_t<float,N> v) {
    // Clearing the sign bit of every lane yields the absolute value.
    static const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
    v = _mm_andnot_ps(sign_mask, v.v4());
    return v;
}

} /* namespace simd4 */
567 
568 # endif
569 
570 
571 # ifdef __AVX_unsupported__
/**
 * AVX specialization: all N (<= 4) double elements in one __m256d.
 * NOTE(review): this whole branch is guarded by __AVX_unsupported__,
 * i.e. it is currently compiled out; the SSE2 version below is used
 * instead.
 */
template<int N>
class alignas(32) simd4_t<double,N>
{
private:
   typedef double  __vec4d_t[N];

    // Union type-puns the AVX register with a plain double array.
    union {
        __m256d simd4;
        __vec4d_t vec;
    };

public:
    simd4_t(void) { simd4 = _mm256_setzero_pd(); }
    simd4_t(double d) { simd4 = _mm256_set1_pd(d); }
    simd4_t(double x, double y) : simd4_t(x,y,0,0) {}
    simd4_t(double x, double y, double z) : simd4_t(x,y,z,0) {}
    simd4_t(double x, double y, double z, double w) {
        simd4 = _mm256_set_pd(w,z,y,x);
    }
    // NOTE(review): loads four doubles regardless of N; for N < 4 this
    // reads past the end of the caller's array unless a per-size
    // specialization below overrides it.
    simd4_t(const __vec4d_t v) { simd4 = _mm256_loadu_pd(v); }
    simd4_t(const simd4_t<double,4>& v) { simd4 = v.v4(); }
    simd4_t(const simd4_t<double,3>& v) { simd4 = v.v4(); }
    simd4_t(const simd4_t<double,2>& v) { simd4 = v.v4(); }
    simd4_t(const __m256d& v) { simd4 = v; }

    // Direct access to the underlying AVX register.
    inline const __m256d (&v4(void) const) {
        return simd4;
    }
    inline __m256d (&v4(void)) {
        return simd4;
    }

    // Access the elements as a plain double array.
    inline const double (&ptr(void) const)[N] {
        return vec;
    }
    inline double (&ptr(void))[N] {
        return vec;
    }

    inline  const double& operator[](unsigned n) const {
        assert(n<N);
        return vec[n];
    }

    inline double& operator[](unsigned n) {
        assert(n<N);
        return vec[n];
    }

    // Cross-size assignment converts through the matching constructor.
    template<int M>
    inline simd4_t<double,N>& operator=(const simd4_t<double,M>& v) {
        simd4 = simd4_t<double,N>(v).v4();
        return *this;
    }
    inline simd4_t<double,N>& operator=(const __m256d& v) {
        simd4 = v;
        return *this;
    }

    inline simd4_t<double,N>& operator+=(double d) {
        simd4 = _mm256_add_pd(simd4, _mm256_set1_pd(d));
        return *this;
    }
    inline simd4_t<double,N>& operator+=(const simd4_t<double,N>& v) {
        simd4 = _mm256_add_pd(simd4, v.v4());
        return *this;
    }

    inline simd4_t<double,N>& operator-=(double d) {
        simd4 = _mm256_sub_pd(simd4, _mm256_set1_pd(d));
        return *this;
    }
    inline simd4_t<double,N>& operator-=(const simd4_t<double,N>& v) {
        simd4 = _mm256_sub_pd(simd4, v.v4());
        return *this;
    }

    inline simd4_t<double,N>& operator*=(double d) {
        simd4 = _mm256_mul_pd(simd4, _mm256_set1_pd(d));
        return *this;
    }
    inline simd4_t<double,N>& operator*=(const simd4_t<double,N>& v) {
        simd4 = _mm256_mul_pd(simd4, v.v4());
        return *this;
    }

    inline simd4_t<double,N>& operator/=(double d) {
        simd4 = _mm256_div_pd(simd4, _mm256_set1_pd(d));
        return *this;
    }
    inline simd4_t<double,N>& operator/=(const simd4_t<double,N>& v) {
        simd4 = _mm256_div_pd(simd4, v.v4());
        return *this;
    }
};
667 
// Lane masks: dmaskN keeps the first N double lanes and zeroes the rest.
static const __m256d dmask2 = _mm256_load_pd((const double*)simd4::m2a64);
static const __m256d dmask3 = _mm256_load_pd((const double*)simd4::m3a64);

// Four-component constructors; surplus lanes are masked for N < 4.
template<>
inline simd4_t<double,4>::simd4_t(double x, double y, double z, double w) {
    simd4 = _mm256_set_pd(w,z,y,x);
}
template<>
inline simd4_t<double,3>::simd4_t(double x, double y, double z, double w) {
    simd4 = _mm256_and_pd(_mm256_set_pd(w,z,y,x), dmask3);
}
template<>
inline simd4_t<double,2>::simd4_t(double x, double y, double z, double w) {
    simd4 = _mm256_and_pd(_mm256_set_pd(w,z,y,x), dmask2);
}

// Cross-size copy constructors: narrowing masks off the extra lanes;
// widening relies on the source's padding already being zero.
template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,4>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,4>& v) {
    simd4 = _mm256_and_pd(v.v4(), dmask3);
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,4>& v) {
    simd4 = _mm256_and_pd(v.v4(), dmask2);
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,3>& v) {
    simd4 = _mm256_and_pd(v.v4(), dmask2);
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,2>& v) {
    simd4 = v.v4();
}

// Array constructors.
// NOTE(review): for N == 3 and N == 2 the unaligned full load still
// reads past the end of the caller's N-element array before masking.
template<>
inline simd4_t<double,4>::simd4_t(const __vec4d_t v) {
    simd4 = _mm256_loadu_pd(v);
}
template<>
inline simd4_t<double,3>::simd4_t(const __vec4d_t v) {
    simd4 = _mm256_and_pd(_mm256_loadu_pd(v), dmask3);
}
template<>
inline simd4_t<double,2>::simd4_t(const __vec4d_t v) {
    simd4 = _mm256_and_pd(_mm256_loadu_pd(v), dmask2);
}
// Scalar-broadcast constructors; masked for N < 4.
template<>
inline simd4_t<double,4>::simd4_t(double d) {
    simd4 = _mm256_set1_pd(d);
}
template<>
inline simd4_t<double,3>::simd4_t(double d) {
    simd4 = _mm256_and_pd(_mm256_set1_pd(d), dmask3);
}
template<>
inline simd4_t<double,2>::simd4_t(double d) {
    simd4 = _mm256_and_pd(_mm256_set1_pd(d), dmask2);
}
745 
namespace simd4
{
// Horizontal sum of the four double lanes of an AVX register.
// http://berenger.eu/blog/sseavxsimd-horizontal-sum-sum-simd-vector-intrinsic/
inline static double hsum_pd_avx(__m256d v) {
    const __m128d valupper = _mm256_extractf128_pd(v, 1);
    const __m128d vallower = _mm256_castpd256_pd128(v);
    _mm256_zeroupper();
    const __m128d valval = _mm_add_pd(valupper, vallower);
    const __m128d sums =   _mm_add_pd(_mm_permute_pd(valval,1), valval);
    return                 _mm_cvtsd_f64(sums);
}

// Vectorized squared length.
template<>
inline double magnitude2(const simd4_t<double,4>& v) {
    return hsum_pd_avx(_mm256_mul_pd(v.v4(),v.v4()));
}

// Vectorized dot product.
template<>
inline double dot(const simd4_t<double,4>& v1, const simd4_t<double,4>& v2) {
    return hsum_pd_avx(_mm256_mul_pd(v1.v4(),v2.v4()));
}

#  ifdef __AVX2__
// Vectorized 3D cross product (needs AVX2 cross-lane permutes).
template<>
inline simd4_t<double,3> cross(const simd4_t<double,3>& v1, const simd4_t<double,3>& v2)
{
    // https://gist.github.com/L2Program/219e07581e69110e7942
    __m256d v41 = v1.v4(), v42 = v2.v4();
    return _mm256_sub_pd(
               _mm256_mul_pd(
                     _mm256_permute4x64_pd(v41,_MM_SHUFFLE(3, 0, 2, 1)),
                     _mm256_permute4x64_pd(v42,_MM_SHUFFLE(3, 1, 0, 2))),
               _mm256_mul_pd(
                     _mm256_permute4x64_pd(v41,_MM_SHUFFLE(3, 1, 0, 2)),
                     _mm256_permute4x64_pd(v42,_MM_SHUFFLE(3, 0, 2, 1))));
}
#  endif

// Vectorized overloads of min/max/abs for any simd4_t<double,N>.
template<int N>
inline simd4_t<double,N> min(simd4_t<double,N> v1, const simd4_t<double,N>& v2) {
    v1 = _mm256_min_pd(v1.v4(), v2.v4());
    return v1;
}

template<int N>
inline simd4_t<double,N> max(simd4_t<double,N> v1, const simd4_t<double,N>& v2) {
    v1 = _mm256_max_pd(v1.v4(), v2.v4());
    return v1;
}

template<int N>
inline simd4_t<double,N>abs(simd4_t<double,N> v) {
    // Clearing the sign bit of every lane yields the absolute value.
    static const __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
    v = _mm256_andnot_pd(sign_mask, v.v4());
    return v;
}

} /* namespace simd4 */
804 
805 # elif defined __SSE2__
806 
807 template<int N>
808 class alignas(16) simd4_t<double,N>
809 {
810 private:
811    typedef double  __vec4d_t[N];
812 
813     union {
814         __m128d simd4[2];
815         __vec4d_t vec;
816     };
817 
818 public:
simd4_t(void)819     simd4_t(void) { simd4[0] = simd4[1] = _mm_setzero_pd(); }
simd4_t(double d)820     simd4_t(double d) { simd4[0] = simd4[1] = _mm_set1_pd(d); }
simd4_t(double x,double y)821     simd4_t(double x, double y) : simd4_t(x,y,0,0) {}
simd4_t(double x,double y,double z)822     simd4_t(double x, double y, double z) : simd4_t(x,y,z,0) {}
simd4_t(double x,double y,double z,double w)823     simd4_t(double x, double y, double z, double w) {
824         simd4[0] = _mm_set_pd(y,x); simd4[1] = _mm_set_pd(w,z);
825     }
simd4_t(const __vec4d_t v)826     simd4_t(const __vec4d_t v) {
827         simd4[0] = _mm_loadu_pd(v); simd4[1] = _mm_loadu_pd(v+2);
828     }
simd4_t(const simd4_t<double,4> & v)829     simd4_t(const simd4_t<double,4>& v) {
830         simd4[0] = v.v4()[0]; simd4[1] = v.v4()[1];
831     }
simd4_t(const simd4_t<double,3> & v)832     simd4_t(const simd4_t<double,3>& v) {
833         simd4[0] = v.v4()[0]; simd4[1] = v.v4()[1];
834     }
simd4_t(const simd4_t<double,2> & v)835     simd4_t(const simd4_t<double,2>& v) {
836         simd4[0] = v.v4()[0]; simd4[1] = _mm_setzero_pd();
837     }
simd4_t(const __m128d v[2])838     simd4_t(const __m128d v[2]) {
839         simd4[0] = v[0];
840         simd4[1] = v[1];
841     }
842 
__anon3e7c049c0d02null843     inline const __m128d (&v4(void) const)[2] {
844         return simd4;
845     }
__anon3e7c049c0e02null846     inline __m128d (&v4(void))[2] {
847         return simd4;
848     }
849 
__anon3e7c049c0f02null850     inline const double (&ptr(void) const)[N] {
851         return vec;
852     }
__anon3e7c049c1002null853     inline double (&ptr(void))[N] {
854         return vec;
855     }
856 
857     inline  const double& operator[](unsigned n) const {
858         assert(n<N);
859         return vec[n];
860     }
861 
operator [](unsigned n)862     inline double& operator[](unsigned n) {
863         assert(n<N);
864         return vec[n];
865     }
866 
867     template<int M>
operator =(const simd4_t<double,M> & v)868     inline simd4_t<double,N>& operator=(const simd4_t<double,M>& v) {
869         simd4_t<double,N> n(v);
870         simd4[0] = n.v4()[0];
871         simd4[1] = n.v4()[1];
872         return *this;
873     }
operator =(const __m128d v[2])874     inline simd4_t<double,N>& operator=(const __m128d v[2]) {
875         simd4[0] = v[0];
876         simd4[1] = v[1];
877         return *this;
878     }
879 
operator +=(double d)880     inline simd4_t<double,N>& operator+=(double d) {
881         __m128d d4 = _mm_set1_pd(d);
882         simd4[0] = _mm_add_pd(simd4[0], d4);
883         simd4[1] = _mm_add_pd(simd4[1], d4);
884         return *this;
885     }
operator +=(const simd4_t<double,N> & v)886     inline simd4_t<double,N>& operator+=(const simd4_t<double,N>& v) {
887         simd4[0] = _mm_add_pd(simd4[0], v.v4()[0]);
888         simd4[1] = _mm_add_pd(simd4[1], v.v4()[1]);
889         return *this;
890     }
891 
operator -=(double d)892     inline simd4_t<double,N>& operator-=(double d) {
893         __m128d d4 = _mm_set1_pd(d);
894         simd4[0] = _mm_sub_pd(simd4[0], d4);
895         simd4[1] = _mm_sub_pd(simd4[1], d4);
896     }
operator -=(const simd4_t<double,N> & v)897     inline simd4_t<double,N>& operator-=(const simd4_t<double,N>& v) {
898         simd4[0] = _mm_sub_pd(simd4[0], v.v4()[0]);
899         simd4[1] = _mm_sub_pd(simd4[1], v.v4()[1]);
900         return *this;
901     }
902 
operator *=(double d)903     inline simd4_t<double,N>& operator*=(double d) {
904         __m128d d4 = _mm_set1_pd(d);
905         simd4[0] = _mm_mul_pd(simd4[0], d4);
906         simd4[1] = _mm_mul_pd(simd4[1], d4);
907         return *this;
908     }
operator *=(const simd4_t<double,N> & v)909     inline simd4_t<double,N>& operator*=(const simd4_t<double,N>& v) {
910         simd4[0] = _mm_mul_pd(simd4[0], v.v4()[0]);
911         simd4[1] = _mm_mul_pd(simd4[1], v.v4()[1]);
912         return *this;
913     }
914 
operator /=(double d)915     inline simd4_t<double,N>& operator/=(double d) {
916         __m128d d4 = _mm_set1_pd(d);
917         simd4[0] = _mm_div_pd(simd4[0], d4);
918         simd4[1] = _mm_div_pd(simd4[1], d4);
919     }
operator /=(const simd4_t<double,N> & v)920     inline simd4_t<double,N>& operator/=(const simd4_t<double,N>& v) {
921         simd4[0] = _mm_div_pd(simd4[0], v.v4()[0]);
922         simd4[1] = _mm_div_pd(simd4[1], v.v4()[1]);
923         return *this;
924     }
925 };
926 
927 static const __m128d dmask3 = _mm_load_pd((const double*)(simd4::m3a64+2));
928 
// Four-argument constructors.  _mm_set_pd takes arguments high-to-low,
// hence the reversed (y,x) / (w,z) order.
template<>
inline simd4_t<double,4>::simd4_t(double x, double y, double z, double w) {
    simd4[0] = _mm_set_pd(y,x);
    simd4[1] = _mm_set_pd(w,z);
}
// 3-component version: the w lane is masked to zero.
template<>
inline simd4_t<double,3>::simd4_t(double x, double y, double z, double w) {
    simd4[0] = _mm_set_pd(y,x);
    simd4[1] = _mm_and_pd(_mm_set_pd(w,z), dmask3);
}
// 2-component version: the whole upper register (z,w) is cleared.
template<>
inline simd4_t<double,2>::simd4_t(double x, double y, double z, double w) {
    simd4[0] = _mm_set_pd(y,x);
    simd4[1] = _mm_setzero_pd();
}
944 
// Converting copy constructors between vector sizes.  Narrowing (4->3) masks
// the lanes the target does not use; narrowing to 2 clears the upper register
// entirely.  Widening copies directly and relies on the source's unused lanes
// already being zero.
template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,4>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = v.v4()[1];
}
template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,3>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = v.v4()[1];
}
template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,2>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = v.v4()[1];
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,4>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = _mm_and_pd(v.v4()[1], dmask3);   // drop the w lane
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,3>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = v.v4()[1];
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,2>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = _mm_setzero_pd();
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,4>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = _mm_setzero_pd();
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,3>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = _mm_setzero_pd();
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,2>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = _mm_setzero_pd();
}
990 
// Load four doubles from a (possibly unaligned) double[4].
template<>
inline simd4_t<double,4>::simd4_t(const __vec4d_t v) {
    simd4[0] = _mm_loadu_pd(v);
    simd4[1] = _mm_loadu_pd(v+2);
}
996 template<>
simd4_t(const __vec4d_t v)997 inline simd4_t<double,3>::simd4_t(const __vec4d_t v) {
998     simd4[0] = _mm_loadu_pd(v);
999     simd4[1] = _mm_and_pd(_mm_loadu_pd(v+2), dmask3);
1000 }
// Load two doubles from a (possibly unaligned) double[2]; z,w are zeroed.
template<>
inline simd4_t<double,2>::simd4_t(const __vec4d_t v) {
    simd4[0] = _mm_loadu_pd(v);
    simd4[1] = _mm_setzero_pd();
}
1006 template<>
simd4_t(double d)1007 inline simd4_t<double,4>::simd4_t(double d) {
1008     simd4[0] = simd4[1] = _mm_set1_pd(d);
1009 }
1010 template<>
simd4_t(double d)1011 inline simd4_t<double,3>::simd4_t(double d) {
1012     simd4[0] = _mm_set1_pd(d);
1013     simd4[1] = _mm_and_pd(simd4[0], dmask3);
1014 }
1015 template<>
simd4_t(double d)1016 inline simd4_t<double,2>::simd4_t(double d) {
1017     simd4[0] = _mm_set1_pd(d);
1018     simd4[1] = _mm_setzero_pd();
1019 }
1020 
1021 namespace simd4
1022 {
1023 // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
hsum_pd_sse(const __m128d vd[2])1024 inline static double hsum_pd_sse(const __m128d vd[2]) {
1025     __m128 undef    = _mm_setzero_ps();
1026     __m128 shuftmp1 = _mm_movehl_ps(undef, _mm_castpd_ps(vd[0]));
1027     __m128 shuftmp2 = _mm_movehl_ps(undef, _mm_castpd_ps(vd[1]));
1028     __m128d shuf1   = _mm_castps_pd(shuftmp1);
1029     __m128d shuf2   = _mm_castps_pd(shuftmp2);
1030     return  _mm_cvtsd_f64(_mm_add_sd(vd[0], shuf1)) +
1031             _mm_cvtsd_f64(_mm_add_sd(vd[1], shuf2));
1032 }
1033 
1034 template<>
magnitude2(const simd4_t<double,4> & v)1035 inline double magnitude2(const simd4_t<double,4>& v) {
1036     __m128d v2[2];
1037     v2[0] = _mm_mul_pd(v.v4()[0],v.v4()[0]);
1038     v2[1] = _mm_mul_pd(v.v4()[1],v.v4()[1]);
1039     return hsum_pd_sse(v2);
1040 }
1041 
1042 template<>
dot(const simd4_t<double,4> & v1,const simd4_t<double,4> & v2)1043 inline double dot(const simd4_t<double,4>& v1, const simd4_t<double,4>& v2) {
1044     __m128d mv[2];
1045     mv[0] = _mm_mul_pd(v1.v4()[0],v2.v4()[0]);
1046     mv[1] = _mm_mul_pd(v1.v4()[1],v2.v4()[1]);
1047     return hsum_pd_sse(mv);
1048 }
1049 
// Cross product of two 3-component vectors, evaluated two lanes at a time.
// With _MM_SHUFFLE2, a[0]={y1,z1} and a[1]={x1,w1} are rotated copies of v1
// (likewise b of v2), so that:
//   c[0] = {x1*y2 - y1*x2, y1*z2 - z1*y2} = {r_z, r_x}
//   c[1] = {z1*x2 - x1*z2, w1*w2 - w1*w2} = {r_y, 0}
// The final shuffles reorder the partial results into r = {r_x, r_y, r_z, 0}.
template<>
inline simd4_t<double,3> cross(const simd4_t<double,3>& v1, const simd4_t<double,3>& v2)
{
    __m128d a[2], b[2], c[2];
    simd4_t<double,3> r;

    a[0] = _mm_shuffle_pd(v1.v4()[0], v1.v4()[1], _MM_SHUFFLE2(0, 1));
    a[1] = _mm_shuffle_pd(v1.v4()[0], v1.v4()[1], _MM_SHUFFLE2(1, 0));

    b[0] = _mm_shuffle_pd(v2.v4()[0], v2.v4()[1], _MM_SHUFFLE2(0, 1));
    b[1] = _mm_shuffle_pd(v2.v4()[0], v2.v4()[1], _MM_SHUFFLE2(1, 0));

    c[0] = _mm_sub_pd(_mm_mul_pd(v1.v4()[0], b[0]), _mm_mul_pd(a[0], v2.v4()[0]));
    c[1] = _mm_sub_pd(_mm_mul_pd(v1.v4()[1], b[1]), _mm_mul_pd(a[1], v2.v4()[1]));

    r.v4()[0] = _mm_shuffle_pd(c[0], c[1], _MM_SHUFFLE2(0, 1));
    r.v4()[1] = _mm_shuffle_pd(c[0], c[1], _MM_SHUFFLE2(1, 0));

    return r;
}
1070 
1071 template<int N>
min(simd4_t<double,N> v1,const simd4_t<double,N> & v2)1072 inline simd4_t<double,N> min(simd4_t<double,N> v1, const simd4_t<double,N>& v2) {
1073     v1.v4()[0] = _mm_min_pd(v1.v4()[0], v2.v4()[0]);
1074     v1.v4()[1] = _mm_min_pd(v1.v4()[1], v2.v4()[1]);
1075     return v1;
1076 }
1077 
1078 template<int N>
max(simd4_t<double,N> v1,const simd4_t<double,N> & v2)1079 inline simd4_t<double,N> max(simd4_t<double,N> v1, const simd4_t<double,N>& v2) {
1080     v1.v4()[0] = _mm_max_pd(v1.v4()[0], v2.v4()[0]);
1081     v1.v4()[1] = _mm_max_pd(v1.v4()[1], v2.v4()[1]);
1082     return v1;
1083 }
1084 
1085 template<int N>
abs(simd4_t<double,N> v)1086 inline simd4_t<double,N>abs(simd4_t<double,N> v) {
1087     static const __m128d sign_mask = _mm_set1_pd(-0.); // -0. = 1 << 63
1088     v.v4()[0] = _mm_andnot_pd(sign_mask, v.v4()[0]);
1089     v.v4()[1] = _mm_andnot_pd(sign_mask, v.v4()[1]);
1090     return v;
1091 }
1092 
1093 } /* namespace simd4 */
1094 
1095 # endif
1096 
1097 
1098 # ifdef __SSE2__
1099 
1100 template<int N>
1101 class alignas(16) simd4_t<int,N>
1102 {
1103 private:
1104    typedef int  __vec4i_t[N];
1105 
1106     union {
1107         __m128i simd4;
1108         __vec4i_t vec;
1109     };
1110 
1111 public:
simd4_t(void)1112     simd4_t(void) { simd4 = _mm_setzero_si128(); }
simd4_t(int i)1113     simd4_t(int i) { simd4 = _mm_set1_epi32(i); }
simd4_t(int x,int y)1114     simd4_t(int x, int y) : simd4_t(x,y,0,0) {}
simd4_t(int x,int y,int z)1115     simd4_t(int x, int y, int z) : simd4_t(x,y,z,0) {}
simd4_t(int x,int y,int z,int w)1116     simd4_t(int x, int y, int z, int w) { simd4 = _mm_set_epi32(w,z,y,x); }
simd4_t(const __vec4i_t v)1117     simd4_t(const __vec4i_t v) { simd4 = _mm_loadu_si128((const __m128i*)v); }
simd4_t(const simd4_t<int,4> & v)1118     simd4_t(const simd4_t<int,4>& v) { simd4 = v.v4(); }
simd4_t(const simd4_t<int,3> & v)1119     simd4_t(const simd4_t<int,3>& v) { simd4 = v.v4(); }
simd4_t(const simd4_t<int,2> & v)1120     simd4_t(const simd4_t<int,2>& v) { simd4 = v.v4(); }
simd4_t(const __m128i & v)1121     simd4_t(const __m128i& v) { simd4 = v; }
1122 
v4(void)1123     inline __m128i &v4(void) {
1124         return simd4;
1125     }
1126 
v4(void) const1127     inline const __m128i &v4(void) const {
1128         return simd4;
1129     }
1130 
__anon3e7c049c1202null1131     inline const int (&ptr(void) const)[N] {
1132         return vec;
1133     }
1134 
__anon3e7c049c1302null1135     inline int (&ptr(void))[N] {
1136         return vec;
1137     }
1138 
1139     inline  const int& operator[](unsigned n) const {
1140         assert(n<N);
1141         return vec[n];
1142     }
1143 
operator [](unsigned n)1144     inline int& operator[](unsigned n) {
1145         assert(n<N);
1146         return vec[n];
1147     }
1148 
1149     template<int M>
operator =(const simd4_t<int,M> & v)1150     inline simd4_t<int,N>& operator=(const simd4_t<int,M>& v) {
1151         simd4 = simd4_t<int,N>(v).v4();
1152         return *this;
1153     }
operator =(const __m128 & v)1154     inline simd4_t<int,N>& operator=(const __m128& v) {
1155         simd4 = v;
1156         return *this;
1157     }
1158 
operator +=(int i)1159     inline simd4_t<int,N>& operator+=(int i) {
1160         simd4 = _mm_add_epi32(simd4, _mm_set1_epi32(i));
1161         return *this;
1162     }
operator +=(const simd4_t<int,N> & v)1163     inline simd4_t<int,N>& operator+=(const simd4_t<int,N>& v) {
1164         simd4 = _mm_add_epi32(simd4, v.v4());
1165         return *this;
1166     }
1167 
operator -=(int i)1168     inline simd4_t<int,N>& operator-=(int i) {
1169         simd4 = _mm_sub_epi32(simd4, _mm_set1_epi32(i));
1170         return *this;
1171     }
operator -=(const simd4_t<int,N> & v)1172     inline simd4_t<int,N>& operator-=(const simd4_t<int,N>& v) {
1173         simd4 = _mm_sub_epi32(simd4, v.v4());
1174         return *this;
1175     }
1176 
operator *=(int i)1177     inline simd4_t<int,N>& operator*=(int i) {
1178         return operator*=(_mm_set1_epi32(i));
1179     }
operator *=(const simd4_t<int,N> & v)1180     inline simd4_t<int,N>& operator*=(const simd4_t<int,N>& v) {
1181          return operator*=(v.v4());
1182     }
1183     // https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
operator *=(const __m128i & v)1184     inline simd4_t<int,N>& operator*=(const __m128i& v) {
1185 #ifdef __SSE4_1__
1186         simd4 = _mm_mullo_epi32(simd4, v);
1187 #else
1188         __m128i tmp1 = _mm_mul_epu32(simd4, v);
1189         __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(simd4,4),
1190                        _mm_srli_si128(v,4));
1191         simd4 =_mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1,_MM_SHUFFLE (0,0,2,0)),
1192                                   _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0)));
1193 #endif
1194         return *this;
1195     }
1196 
operator /=(int s)1197     inline simd4_t<int,N>& operator/=(int s) {
1198         return operator/=(simd4_t<int,4>(s));
1199     }
operator /=(const simd4_t<int,N> & v)1200     inline simd4_t<int,N>& operator/=(const simd4_t<int,N>& v) {
1201         for (int i=0; i<N; ++i) {
1202            vec[i] /= v[i];
1203         }
1204         return *this;
1205     }
1206 };
1207 
// Lane masks used to clear the unused lanes of 2- and 3-component integer
// vectors (presumably m2a32={~0,~0,0,0}, m3a32={~0,~0,~0,0} -- confirm
// against their definitions earlier in this file).
static const __m128i imask2 = _mm_load_si128((__m128i*)simd4::m2a32);
static const __m128i imask3 = _mm_load_si128((__m128i*)simd4::m3a32);
1210 
// Four-argument constructors; _mm_set_epi32 takes arguments high-to-low.
// The 3- and 2-component versions mask the unused lanes to zero.
template<>
inline simd4_t<int,4>::simd4_t(int x, int y, int z, int w) {
    simd4 = _mm_set_epi32(w,z,y,x);
}
template<>
inline simd4_t<int,3>::simd4_t(int x, int y, int z, int w) {
    simd4 = _mm_and_si128(_mm_set_epi32(w,z,y,x), imask3);
}
template<>
inline simd4_t<int,2>::simd4_t(int x, int y, int z, int w) {
    simd4 = _mm_and_si128(_mm_set_epi32(w,z,y,x), imask2);
}
1223 
// Converting copy constructors between vector sizes.  Narrowing masks away
// the lanes the target does not use; widening copies directly and relies on
// the source's unused lanes already being zero.
template<>
inline simd4_t<int,4>::simd4_t(const simd4_t<int,4>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<int,4>::simd4_t(const simd4_t<int,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<int,4>::simd4_t(const simd4_t<int,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<int,3>::simd4_t(const simd4_t<int,4>& v) {
    simd4 = _mm_and_si128(v.v4(), imask3);
}
template<>
inline simd4_t<int,3>::simd4_t(const simd4_t<int,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<int,3>::simd4_t(const simd4_t<int,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<int,2>::simd4_t(const simd4_t<int,4>& v) {
    simd4 = _mm_and_si128(v.v4(), imask2);
}
template<>
inline simd4_t<int,2>::simd4_t(const simd4_t<int,3>& v) {
    simd4 = _mm_and_si128(v.v4(), imask2);
}
template<>
inline simd4_t<int,2>::simd4_t(const simd4_t<int,2>& v) {
    simd4 = v.v4();
}
1260 
// Load four ints from a (possibly unaligned) int[4].
template<>
inline simd4_t<int,4>::simd4_t(const __vec4i_t v) {
    simd4 = _mm_loadu_si128((__m128i*)v);
}
1265 template<>
simd4_t(const __vec4i_t v)1266 inline simd4_t<int,3>::simd4_t(const __vec4i_t v) {
1267     simd4 = _mm_and_si128(_mm_loadu_si128((__m128i*)v), imask3);
1268 }
1269 template<>
simd4_t(const __vec4i_t v)1270 inline simd4_t<int,2>::simd4_t(const __vec4i_t v) {
1271     simd4 = _mm_and_si128(_mm_loadu_si128((__m128i*)v), imask2);
1272 }
1273 template<>
simd4_t(int i)1274 inline simd4_t<int,4>::simd4_t(int i) {
1275     simd4 = _mm_set1_epi32(i);
1276 }
1277 template<>
simd4_t(int i)1278 inline simd4_t<int,3>::simd4_t(int i) {
1279     simd4 = _mm_and_si128(_mm_set1_epi32(i), imask3);
1280 }
1281 template<>
simd4_t(int i)1282 inline simd4_t<int,2>::simd4_t(int i) {
1283     simd4 = _mm_and_si128(_mm_set1_epi32(i), imask2);
1284 }
1285 
1286 
1287 namespace simd4
1288 {
1289 
1290 #  ifdef __SSE4_1__
1291 template<int N>
min(simd4_t<int,N> v1,const simd4_t<int,N> & v2)1292 inline simd4_t<int,N> min(simd4_t<int,N> v1, const simd4_t<int,N>& v2) {
1293     v1 = _mm_min_epi32(v1.v4(), v2.v4());
1294     return v1;
1295 }
1296 
1297 template<int N>
max(simd4_t<int,N> v1,const simd4_t<int,N> & v2)1298 inline simd4_t<int,N> max(simd4_t<int,N> v1, const simd4_t<int,N>& v2) {
1299     v1 = _mm_max_epi32(v1.v4(), v2.v4());
1300     return v1;
1301 }
1302 #  endif /* __SSE4_1__ */
1303 
1304 } /* namespace simd4 */
1305 
1306 # endif
1307 
1308 #endif /* ENABLE_SIMD_CODE */
1309 
1310 #endif /* __SIMD_H__ */
1311 
1312