1 // Copyright (C) 2016-2017 Erik Hofman - erik@ehofman.com
2 //
3 // This library is free software; you can redistribute it and/or
4 // modify it under the terms of the GNU Library General Public
5 // License as published by the Free Software Foundation; either
6 // version 2 of the License, or (at your option) any later version.
7 //
8 // This library is distributed in the hope that it will be useful,
9 // but WITHOUT ANY WARRANTY; without even the implied warranty of
10 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 // Library General Public License for more details.
12 //
13 // You should have received a copy of the GNU General Public License
14 // along with this program; if not, write to the Free Software
15 // Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
16 //
17
18 #ifndef __SIMD_H__
19 #define __SIMD_H__ 1
20
21 #ifdef HAVE_CONFIG_H
22 # include <simgear/simgear_config.h>
23 #endif
24
25 #include <cstdint>
26 #include <cstring>
27 #include <cassert>
28 #include <cmath>
29 #include <new>
30
31 #if defined(_MSC_VER)
32 # include <intrin.h>
33 #elif defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__))
34 # include <x86intrin.h>
35 #elif defined(__GNUC__) && defined(__ARM_NEON__)
36 # include <arm_neon.h>
37 # include <simgear/math/simd_neon.hxx>
38 #endif
39
40 #include <simgear/math/SGLimits.hxx>
41 #include <simgear/math/SGMisc.hxx>
42
43 template<typename T, int N> class simd4_t;
44
45 namespace simd4
46 {
47
48 template<typename T, int N>
min(simd4_t<T,N> v1,const simd4_t<T,N> & v2)49 inline simd4_t<T,N> min(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
50 for (int i=0; i<N; ++i) {
51 v1[i] = SGMisc<T>::min(v1[i], v2[i]);
52 }
53 return v1;
54 }
55
56 template<typename T, int N>
max(simd4_t<T,N> v1,const simd4_t<T,N> & v2)57 inline simd4_t<T,N> max(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
58 for (int i=0; i<N; ++i) {
59 v1[i] = SGMisc<T>::max(v1[i], v2[i]);
60 }
61 return v1;
62 }
63
64 template<typename T, int N>
abs(simd4_t<T,N> v)65 inline simd4_t<T,N> abs(simd4_t<T,N> v) {
66 for (int i=0; i<N; ++i) {
67 v[i] = std::abs(v[i]);
68 }
69 return v;
70 }
71
72 template<typename T, int N>
magnitude2(const simd4_t<T,N> & vi)73 inline T magnitude2(const simd4_t<T,N>& vi) {
74 simd4_t<T,N> v(vi);
75 T mag2 = 0;
76 v = v*v;
77 for (int i=0; i<N; ++i) {
78 mag2 += v[i];
79 }
80 return mag2;
81 }
82
83 template<typename T, int N>
interpolate(T tau,const simd4_t<T,N> & v1,const simd4_t<T,N> & v2)84 inline simd4_t<T,N> interpolate(T tau, const simd4_t<T,N>& v1, const simd4_t<T,N>& v2) {
85 return (T(1)-tau)*v1 + tau*v2;
86 }
87
88 template<typename T, int N>
magnitude(const simd4_t<T,N> & v)89 inline T magnitude(const simd4_t<T,N>& v) {
90 return std::sqrt(magnitude2(v));
91 }
92
93 template<typename T, int N>
normalize(simd4_t<T,N> & v)94 inline T normalize(simd4_t<T,N>& v) {
95 T mag = simd4::magnitude(v);
96 if (mag) {
97 v /= mag;
98 } else {
99 v = T(0);
100 }
101 return mag;
102 }
103
104 template<typename T, int N>
dot(const simd4_t<T,N> & v1,const simd4_t<T,N> & v2)105 inline T dot(const simd4_t<T,N>& v1, const simd4_t<T,N>& v2) {
106 simd4_t<T,N> v(v1*v2);
107 T dp = T(0);
108 for (int i=0; i<N; ++i) {
109 dp += v[i];
110 }
111 return dp;
112 }
113
114 template<typename T>
cross(const simd4_t<T,3> & v1,const simd4_t<T,3> & v2)115 inline simd4_t<T,3> cross(const simd4_t<T,3>& v1, const simd4_t<T,3>& v2)
116 {
117 simd4_t<T,3> d;
118 d[0] = v1[1]*v2[2] - v1[2]*v2[1];
119 d[1] = v1[2]*v2[0] - v1[0]*v2[2];
120 d[2] = v1[0]*v2[1] - v1[1]*v2[0];
121 return d;
122 }
123
124 } /* namespace simd4 */
125
126
/**
 * Portable scalar fallback implementation of a small SIMD-style
 * vector: four slots of T, of which the first N (N <= 4) are the
 * externally visible elements.  Every constructor keeps the unused
 * tail slots at zero so specialized SIMD code paths can safely
 * operate on all four slots.
 */
template<typename T, int N>
class simd4_t
{
private:
    union {
        T _v4[4];   // full four-slot backing store
        T vec[N];   // the N visible elements (aliases _v4)
    };

public:
    // All four slots zero.
    simd4_t(void) {
        for (int i=0; i<4; ++i) _v4[i] = 0;
    }
    // Broadcast s to the N visible elements; tail slots stay zero.
    simd4_t(T s) {
        for (int i=0; i<N; ++i) vec[i] = s;
        for (int i=N; i<4; ++i) _v4[i] = 0;
    }
    simd4_t(T x, T y) : simd4_t(x,y,0,0) {}
    simd4_t(T x, T y, T z) : simd4_t(x,y,z,0) {}
    // Set all four slots, then re-zero any slot beyond N.
    simd4_t(T x, T y, T z, T w) {
        _v4[0] = x; _v4[1] = y; _v4[2] = z; _v4[3] = w;
        for (int i=N; i<4; ++i) _v4[i] = 0;
    }
    // Load the N visible elements from a plain array.
    explicit simd4_t(const T v[N]) {
        std::memcpy(vec, v, sizeof(T[N]));
        for (int i=N; i<4; ++i) _v4[i] = 0;
    }
    // Width conversion: copy M slots from v, then zero from slot
    // min(M,N) upward (truncates the extra slots when M > N).
    template<int M>
    simd4_t(const simd4_t<T,M>& v) {
        std::memcpy(_v4, v.ptr(), sizeof(T[M]));
        for (int i=(M<N)?M:N; i<4; ++i) _v4[i] = 0;
    }
    ~simd4_t(void) {}

    // Reference to the visible elements as a T[N].
    inline T (&v4(void))[N] {
        return vec;
    }

    inline const T (&v4(void) const)[N] {
        return vec;
    }

    inline const T (&ptr(void) const)[N] {
        return vec;
    }

    inline T (&ptr(void))[N] {
        return vec;
    }

    // Element access, assert-checked for n in [0,N).
    inline const T& operator[](unsigned n) const {
        assert(n<N);
        return vec[n];
    }

    inline T& operator[](unsigned n) {
        assert(n<N);
        return vec[n];
    }

    // Width-converting assignment; delegates to the converting ctor.
    template<int M>
    inline simd4_t<T,N>& operator=(const simd4_t<T,M>& v) {
        *this = simd4_t<T,N>(v);
        return *this;
    }

    // Element-wise arithmetic against a scalar, a plain array, or
    // another vector; only the N visible elements are touched, so the
    // zero-tail invariant is preserved.
    inline simd4_t<T,N>& operator+=(T s) {
        for (int i=0; i<N; ++i) {
            vec[i] += s;
        }
        return *this;
    }
    inline simd4_t<T,N>& operator+=(const T v[N]) {
        for (int i=0; i<N; ++i) {
            vec[i] += v[i];
        }
        return *this;
    }
    inline simd4_t<T,N>& operator+=(const simd4_t<T,N>& v) {
        for (int i=0; i<N; ++i) {
            vec[i] += v[i];
        }
        return *this;
    }

    inline simd4_t<T,N>& operator-=(T s) {
        for (int i=0; i<N; ++i) {
            vec[i] -= s;
        }
        return *this;
    }
    inline simd4_t<T,N>& operator-=(const T v[N]) {
        for (int i=0; i<N; ++i) {
            vec[i] -= v[i];
        }
        return *this;
    }
    inline simd4_t<T,N>& operator-=(const simd4_t<T,N>& v) {
        for (int i=0; i<N; ++i) {
            vec[i] -= v[i];
        }
        return *this;
    }
    inline simd4_t<T,N>& operator*=(T s) {
        for (int i=0; i<N; ++i) {
            vec[i] *= s;
        }
        return *this;
    }
    inline simd4_t<T,N>& operator*=(const T v[N]) {
        for (int i=0; i<N; ++i) {
            vec[i] *= v[i];
        }
        return *this;
    }
    inline simd4_t<T,N>& operator*=(const simd4_t<T,N>& v) {
        for (int i=0; i<N; ++i) {
            vec[i] *= v[i];
        }
        return *this;
    }

    inline simd4_t<T,N>& operator/=(T s) {
        for (int i=0; i<N; ++i) {
            vec[i] /= s;
        }
        return *this;
    }
    inline simd4_t<T,N>& operator/=(const T v[N]) {
        for (int i=0; i<N; ++i) {
            vec[i] /= v[i];
        }
        return *this;
    }
    inline simd4_t<T,N>& operator/=(const simd4_t<T,N>& v) {
        for (int i=0; i<N; ++i) {
            vec[i] /= v[i];
        }
        return *this;
    }
};
268
269 template<typename T, int N>
operator -(const simd4_t<T,N> & v)270 inline simd4_t<T,N> operator-(const simd4_t<T,N>& v) {
271 simd4_t<T,N> r(T(0));
272 r -= v;
273 return r;
274 }
275
276 template<typename T, int N>
operator +(simd4_t<T,N> v1,const simd4_t<T,N> & v2)277 inline simd4_t<T,N> operator+(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
278 v1 += v2;
279 return v1;
280 }
281
282 template<typename T, int N>
operator -(simd4_t<T,N> v1,const simd4_t<T,N> & v2)283 inline simd4_t<T,N> operator-(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
284 v1 -= v2;
285 return v1;
286 }
287
288 template<typename T, int N>
operator *(simd4_t<T,N> v1,const simd4_t<T,N> & v2)289 inline simd4_t<T,N> operator*(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
290 v1 *= v2;
291 return v1;
292 }
293
294 template<typename T, int N>
operator /(simd4_t<T,N> v1,const simd4_t<T,N> & v2)295 inline simd4_t<T,N> operator/(simd4_t<T,N> v1, const simd4_t<T,N>& v2) {
296 v1 /= v2;
297 return v1;
298 }
299
300 template<typename T, int N>
operator *(T f,simd4_t<T,N> v)301 inline simd4_t<T,N> operator*(T f, simd4_t<T,N> v) {
302 v *= f;
303 return v;
304 }
305
306 template<typename T, int N>
operator *(simd4_t<T,N> v,T f)307 inline simd4_t<T,N> operator*(simd4_t<T,N> v, T f) {
308 v *= f;
309 return v;
310 }
311
312 #ifdef ENABLE_SIMD_CODE
313
314 # ifdef __SSE__
namespace simd4
{
// Bit-mask tables used to zero the unused tail lanes of 2- and
// 3-element vectors held in full-width SIMD registers.  m2aXX keeps
// the low two lanes, m3aXX keeps the low three.
alignas(16) static const uint32_t m2a32[] = {
    0xffffffff,0xffffffff,0,0
};
alignas(16) static const uint32_t m3a32[] = {
    0xffffffff,0xffffffff,0xffffffff,0
};
alignas(32) static const uint64_t m2a64[] = {
    0xffffffffffffffff,0xffffffffffffffff,0,0
};
alignas(32) static const uint64_t m3a64[] = {
    0xffffffffffffffff,0xffffffffffffffff,0xffffffffffffffff,0
};
} /* namespace simd4 */  // note: stray ';' after the brace removed
330
331
/**
 * SSE specialization of simd4_t for float: all four slots live in a
 * single __m128 register.  The in-class bodies are generic over N;
 * the N-specific behavior (masking the unused tail lanes to zero) is
 * supplied by the explicit member specializations that follow.
 */
template<int N>
class alignas(16) simd4_t<float,N>
{
private:
    typedef float  __vec4f_t[N];

    union {
        __m128 simd4;       // SIMD view of all four lanes
        __vec4f_t vec;      // scalar view of the N visible lanes
    };

public:
    simd4_t(void) { simd4 = _mm_setzero_ps(); }
    simd4_t(float f) { simd4 = _mm_set1_ps(f); }
    simd4_t(float x, float y) : simd4_t(x,y,0,0) {}
    simd4_t(float x, float y, float z) : simd4_t(x,y,z,0) {}
    simd4_t(float x, float y, float z, float w) { simd4 = _mm_set_ps(w,z,y,x); }
    simd4_t(const __vec4f_t v) { simd4 = _mm_loadu_ps(v); }
    // Conversions copy the raw register and rely on the invariant
    // that narrower vectors keep their tail lanes zeroed.
    simd4_t(const simd4_t<float,4>& v) { simd4 = v.v4(); }
    simd4_t(const simd4_t<float,3>& v) { simd4 = v.v4(); }
    simd4_t(const simd4_t<float,2>& v) { simd4 = v.v4(); }
    simd4_t(const __m128& v) { simd4 = v; }

    // Raw access to the underlying SSE register.
    inline const __m128 &v4(void) const {
        return simd4;
    }
    inline __m128 &v4(void) {
        return simd4;
    }

    // Scalar access to the N visible lanes.
    inline const float (&ptr(void) const)[N] {
        return vec;
    }
    inline float (&ptr(void))[N] {
        return vec;
    }

    // Element access, assert-checked for n in [0,N).
    inline const float& operator[](unsigned n) const {
        assert(n<N);
        return vec[n];
    }

    inline float& operator[](unsigned n) {
        assert(n<N);
        return vec[n];
    }

    template<int M>
    inline simd4_t<float,N>& operator=(const simd4_t<float,M>& v) {
        simd4 = simd4_t<float,N>(v).v4();
        return *this;
    }
    inline simd4_t<float,N>& operator=(const __m128& v) {
        simd4 = v;
        return *this;
    }

    // NOTE(review): the scalar-broadcast operators below operate on
    // all four lanes, so for N < 4 they can leave nonzero values in
    // the unused tail lanes (unlike the generic scalar class, which
    // only touches lanes 0..N-1) -- confirm callers only rely on the
    // visible lanes after scalar arithmetic.
    inline simd4_t<float,N>& operator+=(float f) {
        simd4 = _mm_add_ps(simd4, _mm_set1_ps(f));
        return *this;
    }
    inline simd4_t<float,N>& operator+=(const simd4_t<float,N>& v) {
        simd4 = _mm_add_ps(simd4, v.v4());
        return *this;
    }

    inline simd4_t<float,N>& operator-=(float f) {
        simd4 = _mm_sub_ps(simd4, _mm_set1_ps(f));
        return *this;
    }
    inline simd4_t<float,N>& operator-=(const simd4_t<float,N>& v) {
        simd4 = _mm_sub_ps(simd4, v.v4());
        return *this;
    }

    inline simd4_t<float,N>& operator*=(float f) {
        simd4 = _mm_mul_ps(simd4, _mm_set1_ps(f));
        return *this;
    }
    inline simd4_t<float,N>& operator*=(const simd4_t<float,N>& v) {
        simd4 = _mm_mul_ps(simd4, v.v4());
        return *this;
    }

    inline simd4_t<float,N>& operator/=(float f) {
        simd4 = _mm_div_ps(simd4, _mm_set1_ps(f));
        return *this;
    }
    inline simd4_t<float,N>& operator/=(const simd4_t<float,N>& v) {
        simd4 = _mm_div_ps(simd4, v.v4());
        return *this;
    }
};
425
// Register masks keeping the low two / three float lanes and zeroing
// the rest.  NOTE(review): these are dynamically initialized
// namespace-scope statics used by the constructors below; using
// simd4_t<float,2|3> from another static initializer would be subject
// to the static-initialization-order fiasco -- confirm no such use.
static const __m128 fmask2 = _mm_load_ps((const float*)simd4::m2a32);
static const __m128 fmask3 = _mm_load_ps((const float*)simd4::m3a32);

// N-specific constructors: every constructor of a 2- or 3-element
// vector masks the register so its unused tail lanes are zero;
// widening conversions copy the register unmasked and rely on that
// invariant.
template<>
inline simd4_t<float,4>::simd4_t(float x, float y, float z, float w) {
    simd4 = _mm_set_ps(w,z,y,x);
}
template<>
inline simd4_t<float,3>::simd4_t(float x, float y, float z, float w) {
    simd4 = _mm_and_ps(_mm_set_ps(w,z,y,x), fmask3);
}
template<>
inline simd4_t<float,2>::simd4_t(float x, float y, float z, float w) {
    simd4 = _mm_and_ps(_mm_set_ps(w,z,y,x), fmask2);
}

template<>
inline simd4_t<float,4>::simd4_t(const simd4_t<float,4>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<float,4>::simd4_t(const simd4_t<float,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<float,4>::simd4_t(const simd4_t<float,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<float,3>::simd4_t(const simd4_t<float,4>& v) {
    simd4 = _mm_and_ps(v.v4(), fmask3);     // truncate: zero lane 3
}
template<>
inline simd4_t<float,3>::simd4_t(const simd4_t<float,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<float,3>::simd4_t(const simd4_t<float,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<float,2>::simd4_t(const simd4_t<float,4>& v) {
    simd4 = _mm_and_ps(v.v4(), fmask2);     // truncate: zero lanes 2,3
}
template<>
inline simd4_t<float,2>::simd4_t(const simd4_t<float,3>& v) {
    simd4 = _mm_and_ps(v.v4(), fmask2);
}
template<>
inline simd4_t<float,2>::simd4_t(const simd4_t<float,2>& v) {
    simd4 = v.v4();
}

// Array loads: unaligned load, then mask off the unused tail lanes.
template<>
inline simd4_t<float,4>::simd4_t(const __vec4f_t v) {
    simd4 = _mm_loadu_ps(v);
}
template<>
inline simd4_t<float,3>::simd4_t(const __vec4f_t v) {
    simd4 = _mm_and_ps(_mm_loadu_ps(v), fmask3);
}
template<>
inline simd4_t<float,2>::simd4_t(const __vec4f_t v) {
    simd4 = _mm_and_ps(_mm_loadu_ps(v), fmask2);
}
// Scalar broadcasts: broadcast, then mask off the unused tail lanes.
template<>
inline simd4_t<float,4>::simd4_t(float f) {
    simd4 = _mm_set1_ps(f);
}
template<>
inline simd4_t<float,3>::simd4_t(float f) {
    simd4 = _mm_and_ps(_mm_set1_ps(f), fmask3);
}

template<>
inline simd4_t<float,2>::simd4_t(float f) {
    simd4 = _mm_and_ps(_mm_set1_ps(f), fmask2);
}
504
505 namespace simd4
506 {
507 // http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
508 # ifdef __SSE3__
hsum_ps_sse(__m128 v)509 inline static float hsum_ps_sse(__m128 v) {
510 __m128 shuf = _mm_movehdup_ps(v);
511 __m128 sums = _mm_add_ps(v, shuf);
512 shuf = _mm_movehl_ps(shuf, sums);
513 sums = _mm_add_ss(sums, shuf);
514 return _mm_cvtss_f32(sums);
515 }
516 # else /* SSE */
517 inline static float hsum_ps_sse(__m128 v) {
518 __m128 shuf = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 3, 0, 1));
519 __m128 sums = _mm_add_ps(v, shuf);
520 shuf = _mm_movehl_ps(shuf, sums);
521 sums = _mm_add_ss(sums, shuf);
522 return _mm_cvtss_f32(sums);
523 }
524 # endif
525
526 template<>
magnitude2(const simd4_t<float,4> & v)527 inline float magnitude2(const simd4_t<float,4>& v) {
528 return hsum_ps_sse(_mm_mul_ps(v.v4(),v.v4()));
529 }
530
531 template<>
dot(const simd4_t<float,4> & v1,const simd4_t<float,4> & v2)532 inline float dot(const simd4_t<float,4>& v1, const simd4_t<float,4>& v2) {
533 return hsum_ps_sse(_mm_mul_ps(v1.v4(),v2.v4()));
534 }
535
536 template<>
cross(const simd4_t<float,3> & v1,const simd4_t<float,3> & v2)537 inline simd4_t<float,3> cross(const simd4_t<float,3>& v1, const simd4_t<float,3>& v2)
538 {
539 // http://threadlocalmutex.com/?p=8
540 __m128 v41 = v1.v4(), v42 = v2.v4();
541 __m128 a = _mm_shuffle_ps(v41, v41, _MM_SHUFFLE(3, 0, 2, 1));
542 __m128 b = _mm_shuffle_ps(v42, v42, _MM_SHUFFLE(3, 0, 2, 1));
543 __m128 c = _mm_sub_ps(_mm_mul_ps(v41, b), _mm_mul_ps(a, v42));
544 return _mm_shuffle_ps(c, c, _MM_SHUFFLE(3, 0, 2, 1));
545 }
546
547 template<int N>
min(simd4_t<float,N> v1,const simd4_t<float,N> & v2)548 inline simd4_t<float,N> min(simd4_t<float,N> v1, const simd4_t<float,N>& v2) {
549 v1 = _mm_min_ps(v1.v4(), v2.v4());
550 return v1;
551 }
552
553 template<int N>
max(simd4_t<float,N> v1,const simd4_t<float,N> & v2)554 inline simd4_t<float,N> max(simd4_t<float,N> v1, const simd4_t<float,N>& v2) {
555 v1 = _mm_max_ps(v1.v4(), v2.v4());
556 return v1;
557 }
558
559 template<int N>
abs(simd4_t<float,N> v)560 inline simd4_t<float,N>abs(simd4_t<float,N> v) {
561 static const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
562 v = _mm_andnot_ps(sign_mask, v.v4());
563 return v;
564 }
565
566 } /* namsepace simd4 */
567
568 # endif
569
570
571 # ifdef __AVX_unsupported__
// NOTE(review): this entire branch is guarded by __AVX_unsupported__
// (see the #ifdef above), which does not look like a macro any
// compiler defines -- presumably the AVX path was deliberately
// disabled.  Confirm before re-enabling.
/**
 * AVX specialization of simd4_t for double: all four slots live in a
 * single __m256d register.  N-specific lane masking is supplied by
 * the explicit member specializations that follow.
 */
template<int N>
class alignas(32) simd4_t<double,N>
{
private:
    typedef double  __vec4d_t[N];

    union {
        __m256d simd4;      // SIMD view of all four lanes
        __vec4d_t vec;      // scalar view of the N visible lanes
    };

public:
    simd4_t(void) { simd4 = _mm256_setzero_pd(); }
    simd4_t(double d) { simd4 = _mm256_set1_pd(d); }
    simd4_t(double x, double y) : simd4_t(x,y,0,0) {}
    simd4_t(double x, double y, double z) : simd4_t(x,y,z,0) {}
    simd4_t(double x, double y, double z, double w) {
        simd4 = _mm256_set_pd(w,z,y,x);
    }
    simd4_t(const __vec4d_t v) { simd4 = _mm256_loadu_pd(v); }
    simd4_t(const simd4_t<double,4>& v) { simd4 = v.v4(); }
    simd4_t(const simd4_t<double,3>& v) { simd4 = v.v4(); }
    simd4_t(const simd4_t<double,2>& v) { simd4 = v.v4(); }
    simd4_t(const __m256d& v) { simd4 = v; }

    // Raw access to the underlying AVX register.
    inline const __m256d (&v4(void) const) {
        return simd4;
    }
    inline __m256d (&v4(void)) {
        return simd4;
    }

    // Scalar access to the N visible lanes.
    inline const double (&ptr(void) const)[N] {
        return vec;
    }
    inline double (&ptr(void))[N] {
        return vec;
    }

    // Element access, assert-checked for n in [0,N).
    inline const double& operator[](unsigned n) const {
        assert(n<N);
        return vec[n];
    }

    inline double& operator[](unsigned n) {
        assert(n<N);
        return vec[n];
    }

    template<int M>
    inline simd4_t<double,N>& operator=(const simd4_t<double,M>& v) {
        simd4 = simd4_t<double,N>(v).v4();
        return *this;
    }
    inline simd4_t<double,N>& operator=(const __m256d& v) {
        simd4 = v;
        return *this;
    }

    // NOTE(review): scalar broadcasts touch all four lanes; for N < 4
    // the unused tail lanes may become nonzero (see the float class).
    inline simd4_t<double,N>& operator+=(double d) {
        simd4 = _mm256_add_pd(simd4, _mm256_set1_pd(d));
        return *this;
    }
    inline simd4_t<double,N>& operator+=(const simd4_t<double,N>& v) {
        simd4 = _mm256_add_pd(simd4, v.v4());
        return *this;
    }

    inline simd4_t<double,N>& operator-=(double d) {
        simd4 = _mm256_sub_pd(simd4, _mm256_set1_pd(d));
        return *this;
    }
    inline simd4_t<double,N>& operator-=(const simd4_t<double,N>& v) {
        simd4 = _mm256_sub_pd(simd4, v.v4());
        return *this;
    }

    inline simd4_t<double,N>& operator*=(double d) {
        simd4 = _mm256_mul_pd(simd4, _mm256_set1_pd(d));
        return *this;
    }
    inline simd4_t<double,N>& operator*=(const simd4_t<double,N>& v) {
        simd4 = _mm256_mul_pd(simd4, v.v4());
        return *this;
    }

    inline simd4_t<double,N>& operator/=(double d) {
        simd4 = _mm256_div_pd(simd4, _mm256_set1_pd(d));
        return *this;
    }
    inline simd4_t<double,N>& operator/=(const simd4_t<double,N>& v) {
        simd4 = _mm256_div_pd(simd4, v.v4());
        return *this;
    }
};
667
// Register masks keeping the low two / three double lanes (AVX path,
// currently compiled out -- see the __AVX_unsupported__ guard).
// NOTE(review): dynamically initialized namespace statics; subject to
// the static-initialization-order fiasco if used from another static
// initializer.
static const __m256d dmask2 = _mm256_load_pd((const double*)simd4::m2a64);
static const __m256d dmask3 = _mm256_load_pd((const double*)simd4::m3a64);

// N-specific constructors: 2- and 3-element vectors mask the register
// so unused tail lanes are zero; widening conversions copy unmasked
// and rely on that invariant.
template<>
inline simd4_t<double,4>::simd4_t(double x, double y, double z, double w) {
    simd4 = _mm256_set_pd(w,z,y,x);
}
template<>
inline simd4_t<double,3>::simd4_t(double x, double y, double z, double w) {
    simd4 = _mm256_and_pd(_mm256_set_pd(w,z,y,x), dmask3);
}
template<>
inline simd4_t<double,2>::simd4_t(double x, double y, double z, double w) {
    simd4 = _mm256_and_pd(_mm256_set_pd(w,z,y,x), dmask2);
}

template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,4>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,4>& v) {
    simd4 = _mm256_and_pd(v.v4(), dmask3);      // truncate: zero lane 3
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,4>& v) {
    simd4 = _mm256_and_pd(v.v4(), dmask2);      // truncate: zero lanes 2,3
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,3>& v) {
    simd4 = _mm256_and_pd(v.v4(), dmask2);
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,2>& v) {
    simd4 = v.v4();
}

// Array loads: unaligned load, then mask off the unused tail lanes.
template<>
inline simd4_t<double,4>::simd4_t(const __vec4d_t v) {
    simd4 = _mm256_loadu_pd(v);
}
template<>
inline simd4_t<double,3>::simd4_t(const __vec4d_t v) {
    simd4 = _mm256_and_pd(_mm256_loadu_pd(v), dmask3);
}
template<>
inline simd4_t<double,2>::simd4_t(const __vec4d_t v) {
    simd4 = _mm256_and_pd(_mm256_loadu_pd(v), dmask2);
}
// Scalar broadcasts: broadcast, then mask off the unused tail lanes.
template<>
inline simd4_t<double,4>::simd4_t(double d) {
    simd4 = _mm256_set1_pd(d);
}
template<>
inline simd4_t<double,3>::simd4_t(double d) {
    simd4 = _mm256_and_pd(_mm256_set1_pd(d), dmask3);
}
template<>
inline simd4_t<double,2>::simd4_t(double d) {
    simd4 = _mm256_and_pd(_mm256_set1_pd(d), dmask2);
}
745
namespace simd4
{
// http://berenger.eu/blog/sseavxsimd-horizontal-sum-sum-simd-vector-intrinsic/
// Horizontal sum of the four double lanes of an AVX register.
inline static double hsum_pd_avx(__m256d v) {
    const __m128d valupper = _mm256_extractf128_pd(v, 1);
    const __m128d vallower = _mm256_castpd256_pd128(v);
    // presumably to avoid AVX->SSE transition penalties before the
    // SSE adds below -- see the referenced article
    _mm256_zeroupper();
    const __m128d valval = _mm_add_pd(valupper, vallower);
    const __m128d sums = _mm_add_pd(_mm_permute_pd(valval,1), valval);
    return _mm_cvtsd_f64(sums);
}

// Squared length: multiply lane-wise, then horizontal-sum.
template<>
inline double magnitude2(const simd4_t<double,4>& v) {
    return hsum_pd_avx(_mm256_mul_pd(v.v4(),v.v4()));
}

template<>
inline double dot(const simd4_t<double,4>& v1, const simd4_t<double,4>& v2) {
    return hsum_pd_avx(_mm256_mul_pd(v1.v4(),v2.v4()));
}

# ifdef __AVX2__
// Cross product via cross-lane permutes (requires AVX2).
template<>
inline simd4_t<double,3> cross(const simd4_t<double,3>& v1, const simd4_t<double,3>& v2)
{
    // https://gist.github.com/L2Program/219e07581e69110e7942
    __m256d v41 = v1.v4(), v42 = v2.v4();
    return _mm256_sub_pd(
               _mm256_mul_pd(
                   _mm256_permute4x64_pd(v41,_MM_SHUFFLE(3, 0, 2, 1)),
                   _mm256_permute4x64_pd(v42,_MM_SHUFFLE(3, 1, 0, 2))),
               _mm256_mul_pd(
                   _mm256_permute4x64_pd(v41,_MM_SHUFFLE(3, 1, 0, 2)),
                   _mm256_permute4x64_pd(v42,_MM_SHUFFLE(3, 0, 2, 1))));
}
# endif

template<int N>
inline simd4_t<double,N> min(simd4_t<double,N> v1, const simd4_t<double,N>& v2) {
    v1 = _mm256_min_pd(v1.v4(), v2.v4());
    return v1;
}

template<int N>
inline simd4_t<double,N> max(simd4_t<double,N> v1, const simd4_t<double,N>& v2) {
    v1 = _mm256_max_pd(v1.v4(), v2.v4());
    return v1;
}

// Clear the sign bit of every lane.
template<int N>
inline simd4_t<double,N> abs(simd4_t<double,N> v) {
    static const __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
    v = _mm256_andnot_pd(sign_mask, v.v4());
    return v;
}

} /* namespace simd4 */
804
805 # elif defined __SSE2__
806
807 template<int N>
808 class alignas(16) simd4_t<double,N>
809 {
810 private:
811 typedef double __vec4d_t[N];
812
813 union {
814 __m128d simd4[2];
815 __vec4d_t vec;
816 };
817
818 public:
simd4_t(void)819 simd4_t(void) { simd4[0] = simd4[1] = _mm_setzero_pd(); }
simd4_t(double d)820 simd4_t(double d) { simd4[0] = simd4[1] = _mm_set1_pd(d); }
simd4_t(double x,double y)821 simd4_t(double x, double y) : simd4_t(x,y,0,0) {}
simd4_t(double x,double y,double z)822 simd4_t(double x, double y, double z) : simd4_t(x,y,z,0) {}
simd4_t(double x,double y,double z,double w)823 simd4_t(double x, double y, double z, double w) {
824 simd4[0] = _mm_set_pd(y,x); simd4[1] = _mm_set_pd(w,z);
825 }
simd4_t(const __vec4d_t v)826 simd4_t(const __vec4d_t v) {
827 simd4[0] = _mm_loadu_pd(v); simd4[1] = _mm_loadu_pd(v+2);
828 }
simd4_t(const simd4_t<double,4> & v)829 simd4_t(const simd4_t<double,4>& v) {
830 simd4[0] = v.v4()[0]; simd4[1] = v.v4()[1];
831 }
simd4_t(const simd4_t<double,3> & v)832 simd4_t(const simd4_t<double,3>& v) {
833 simd4[0] = v.v4()[0]; simd4[1] = v.v4()[1];
834 }
simd4_t(const simd4_t<double,2> & v)835 simd4_t(const simd4_t<double,2>& v) {
836 simd4[0] = v.v4()[0]; simd4[1] = _mm_setzero_pd();
837 }
simd4_t(const __m128d v[2])838 simd4_t(const __m128d v[2]) {
839 simd4[0] = v[0];
840 simd4[1] = v[1];
841 }
842
__anon3e7c049c0d02null843 inline const __m128d (&v4(void) const)[2] {
844 return simd4;
845 }
__anon3e7c049c0e02null846 inline __m128d (&v4(void))[2] {
847 return simd4;
848 }
849
__anon3e7c049c0f02null850 inline const double (&ptr(void) const)[N] {
851 return vec;
852 }
__anon3e7c049c1002null853 inline double (&ptr(void))[N] {
854 return vec;
855 }
856
857 inline const double& operator[](unsigned n) const {
858 assert(n<N);
859 return vec[n];
860 }
861
operator [](unsigned n)862 inline double& operator[](unsigned n) {
863 assert(n<N);
864 return vec[n];
865 }
866
867 template<int M>
operator =(const simd4_t<double,M> & v)868 inline simd4_t<double,N>& operator=(const simd4_t<double,M>& v) {
869 simd4_t<double,N> n(v);
870 simd4[0] = n.v4()[0];
871 simd4[1] = n.v4()[1];
872 return *this;
873 }
operator =(const __m128d v[2])874 inline simd4_t<double,N>& operator=(const __m128d v[2]) {
875 simd4[0] = v[0];
876 simd4[1] = v[1];
877 return *this;
878 }
879
operator +=(double d)880 inline simd4_t<double,N>& operator+=(double d) {
881 __m128d d4 = _mm_set1_pd(d);
882 simd4[0] = _mm_add_pd(simd4[0], d4);
883 simd4[1] = _mm_add_pd(simd4[1], d4);
884 return *this;
885 }
operator +=(const simd4_t<double,N> & v)886 inline simd4_t<double,N>& operator+=(const simd4_t<double,N>& v) {
887 simd4[0] = _mm_add_pd(simd4[0], v.v4()[0]);
888 simd4[1] = _mm_add_pd(simd4[1], v.v4()[1]);
889 return *this;
890 }
891
operator -=(double d)892 inline simd4_t<double,N>& operator-=(double d) {
893 __m128d d4 = _mm_set1_pd(d);
894 simd4[0] = _mm_sub_pd(simd4[0], d4);
895 simd4[1] = _mm_sub_pd(simd4[1], d4);
896 }
operator -=(const simd4_t<double,N> & v)897 inline simd4_t<double,N>& operator-=(const simd4_t<double,N>& v) {
898 simd4[0] = _mm_sub_pd(simd4[0], v.v4()[0]);
899 simd4[1] = _mm_sub_pd(simd4[1], v.v4()[1]);
900 return *this;
901 }
902
operator *=(double d)903 inline simd4_t<double,N>& operator*=(double d) {
904 __m128d d4 = _mm_set1_pd(d);
905 simd4[0] = _mm_mul_pd(simd4[0], d4);
906 simd4[1] = _mm_mul_pd(simd4[1], d4);
907 return *this;
908 }
operator *=(const simd4_t<double,N> & v)909 inline simd4_t<double,N>& operator*=(const simd4_t<double,N>& v) {
910 simd4[0] = _mm_mul_pd(simd4[0], v.v4()[0]);
911 simd4[1] = _mm_mul_pd(simd4[1], v.v4()[1]);
912 return *this;
913 }
914
operator /=(double d)915 inline simd4_t<double,N>& operator/=(double d) {
916 __m128d d4 = _mm_set1_pd(d);
917 simd4[0] = _mm_div_pd(simd4[0], d4);
918 simd4[1] = _mm_div_pd(simd4[1], d4);
919 }
operator /=(const simd4_t<double,N> & v)920 inline simd4_t<double,N>& operator/=(const simd4_t<double,N>& v) {
921 simd4[0] = _mm_div_pd(simd4[0], v.v4()[0]);
922 simd4[1] = _mm_div_pd(simd4[1], v.v4()[1]);
923 return *this;
924 }
925 };
926
// Mask for the high register pair keeping lane z and zeroing lane w
// (loads elements 2..3 of m3a64, i.e. {~0, 0}).  NOTE(review):
// dynamically initialized namespace static; subject to the
// static-initialization-order fiasco if used from another static
// initializer.
static const __m128d dmask3 = _mm_load_pd((const double*)(simd4::m3a64+2));

// N-specific constructors: 2- and 3-element vectors zero (or mask)
// the unused tail slots; widening conversions rely on that invariant.
template<>
inline simd4_t<double,4>::simd4_t(double x, double y, double z, double w) {
    simd4[0] = _mm_set_pd(y,x);
    simd4[1] = _mm_set_pd(w,z);
}
template<>
inline simd4_t<double,3>::simd4_t(double x, double y, double z, double w) {
    simd4[0] = _mm_set_pd(y,x);
    simd4[1] = _mm_and_pd(_mm_set_pd(w,z), dmask3);
}
template<>
inline simd4_t<double,2>::simd4_t(double x, double y, double z, double w) {
    simd4[0] = _mm_set_pd(y,x);
    simd4[1] = _mm_setzero_pd();
}

template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,4>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = v.v4()[1];
}
template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,3>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = v.v4()[1];
}
template<>
inline simd4_t<double,4>::simd4_t(const simd4_t<double,2>& v) {
    // reads v's high pair, which <double,2> always keeps zeroed
    simd4[0] = v.v4()[0];
    simd4[1] = v.v4()[1];
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,4>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = _mm_and_pd(v.v4()[1], dmask3);   // truncate: zero lane w
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,3>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = v.v4()[1];
}
template<>
inline simd4_t<double,3>::simd4_t(const simd4_t<double,2>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = _mm_setzero_pd();
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,4>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = _mm_setzero_pd();
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,3>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = _mm_setzero_pd();
}
template<>
inline simd4_t<double,2>::simd4_t(const simd4_t<double,2>& v) {
    simd4[0] = v.v4()[0];
    simd4[1] = _mm_setzero_pd();
}

// Array loads: unaligned loads, tail slots zeroed or masked.
template<>
inline simd4_t<double,4>::simd4_t(const __vec4d_t v) {
    simd4[0] = _mm_loadu_pd(v);
    simd4[1] = _mm_loadu_pd(v+2);
}
template<>
inline simd4_t<double,3>::simd4_t(const __vec4d_t v) {
    simd4[0] = _mm_loadu_pd(v);
    simd4[1] = _mm_and_pd(_mm_loadu_pd(v+2), dmask3);
}
template<>
inline simd4_t<double,2>::simd4_t(const __vec4d_t v) {
    simd4[0] = _mm_loadu_pd(v);
    simd4[1] = _mm_setzero_pd();
}
// Scalar broadcasts: tail slots zeroed or masked.
template<>
inline simd4_t<double,4>::simd4_t(double d) {
    simd4[0] = simd4[1] = _mm_set1_pd(d);
}
template<>
inline simd4_t<double,3>::simd4_t(double d) {
    simd4[0] = _mm_set1_pd(d);
    simd4[1] = _mm_and_pd(simd4[0], dmask3);
}
template<>
inline simd4_t<double,2>::simd4_t(double d) {
    simd4[0] = _mm_set1_pd(d);
    simd4[1] = _mm_setzero_pd();
}
1020
1021 namespace simd4
1022 {
// Horizontal sum of the four doubles spread over vd[0] and vd[1].
// Each half's high lane is moved down with movehl_ps — a float-domain
// move chosen over unpackhi_pd for speed on older CPUs, see:
// http://stackoverflow.com/questions/6996764/fastest-way-to-do-horizontal-float-vector-sum-on-x86
// 'undef' only supplies the (unused) upper half of the movehl result.
inline static double hsum_pd_sse(const __m128d vd[2]) {
    __m128 undef = _mm_setzero_ps();
    __m128 shuftmp1 = _mm_movehl_ps(undef, _mm_castpd_ps(vd[0]));
    __m128 shuftmp2 = _mm_movehl_ps(undef, _mm_castpd_ps(vd[1]));
    __m128d shuf1 = _mm_castps_pd(shuftmp1);
    __m128d shuf2 = _mm_castps_pd(shuftmp2);
    // low lane + high lane of each half, then add the two scalar sums
    return _mm_cvtsd_f64(_mm_add_sd(vd[0], shuf1)) +
           _mm_cvtsd_f64(_mm_add_sd(vd[1], shuf2));
}
1033
1034 template<>
magnitude2(const simd4_t<double,4> & v)1035 inline double magnitude2(const simd4_t<double,4>& v) {
1036 __m128d v2[2];
1037 v2[0] = _mm_mul_pd(v.v4()[0],v.v4()[0]);
1038 v2[1] = _mm_mul_pd(v.v4()[1],v.v4()[1]);
1039 return hsum_pd_sse(v2);
1040 }
1041
1042 template<>
dot(const simd4_t<double,4> & v1,const simd4_t<double,4> & v2)1043 inline double dot(const simd4_t<double,4>& v1, const simd4_t<double,4>& v2) {
1044 __m128d mv[2];
1045 mv[0] = _mm_mul_pd(v1.v4()[0],v2.v4()[0]);
1046 mv[1] = _mm_mul_pd(v1.v4()[1],v2.v4()[1]);
1047 return hsum_pd_sse(mv);
1048 }
1049
// Cross product of two 3-wide double vectors using SSE2 shuffles.
// Writing a vector as two registers (x,y | z,w) with the w lane zero:
//   a[0] = (y1,z1), a[1] = (x1,w1)        (b[*] likewise from v2)
//   c[0] = (x1,y1)*(y2,z2) - (y1,z1)*(x2,y2) = (x1*y2-y1*x2, y1*z2-z1*y2)
//   c[1] = (z1,w1)*(x2,w2) - (x1,w1)*(z2,w2) = (z1*x2-x1*z2, 0)
// The final shuffles reorder these into
//   (y1*z2-z1*y2, z1*x2-x1*z2 | x1*y2-y1*x2, 0)
// i.e. the standard cross product with a zero w lane.
template<>
inline simd4_t<double,3> cross(const simd4_t<double,3>& v1, const simd4_t<double,3>& v2)
{
    __m128d a[2], b[2], c[2];
    simd4_t<double,3> r;

    a[0] = _mm_shuffle_pd(v1.v4()[0], v1.v4()[1], _MM_SHUFFLE2(0, 1));
    a[1] = _mm_shuffle_pd(v1.v4()[0], v1.v4()[1], _MM_SHUFFLE2(1, 0));

    b[0] = _mm_shuffle_pd(v2.v4()[0], v2.v4()[1], _MM_SHUFFLE2(0, 1));
    b[1] = _mm_shuffle_pd(v2.v4()[0], v2.v4()[1], _MM_SHUFFLE2(1, 0));

    c[0] = _mm_sub_pd(_mm_mul_pd(v1.v4()[0], b[0]), _mm_mul_pd(a[0], v2.v4()[0]));
    c[1] = _mm_sub_pd(_mm_mul_pd(v1.v4()[1], b[1]), _mm_mul_pd(a[1], v2.v4()[1]));

    r.v4()[0] = _mm_shuffle_pd(c[0], c[1], _MM_SHUFFLE2(0, 1));
    r.v4()[1] = _mm_shuffle_pd(c[0], c[1], _MM_SHUFFLE2(1, 0));

    return r;
}
1070
1071 template<int N>
min(simd4_t<double,N> v1,const simd4_t<double,N> & v2)1072 inline simd4_t<double,N> min(simd4_t<double,N> v1, const simd4_t<double,N>& v2) {
1073 v1.v4()[0] = _mm_min_pd(v1.v4()[0], v2.v4()[0]);
1074 v1.v4()[1] = _mm_min_pd(v1.v4()[1], v2.v4()[1]);
1075 return v1;
1076 }
1077
1078 template<int N>
max(simd4_t<double,N> v1,const simd4_t<double,N> & v2)1079 inline simd4_t<double,N> max(simd4_t<double,N> v1, const simd4_t<double,N>& v2) {
1080 v1.v4()[0] = _mm_max_pd(v1.v4()[0], v2.v4()[0]);
1081 v1.v4()[1] = _mm_max_pd(v1.v4()[1], v2.v4()[1]);
1082 return v1;
1083 }
1084
1085 template<int N>
abs(simd4_t<double,N> v)1086 inline simd4_t<double,N>abs(simd4_t<double,N> v) {
1087 static const __m128d sign_mask = _mm_set1_pd(-0.); // -0. = 1 << 63
1088 v.v4()[0] = _mm_andnot_pd(sign_mask, v.v4()[0]);
1089 v.v4()[1] = _mm_andnot_pd(sign_mask, v.v4()[1]);
1090 return v;
1091 }
1092
1093 } /* namespace simd4 */
1094
1095 # endif
1096
1097
1098 # ifdef __SSE2__
1099
1100 template<int N>
1101 class alignas(16) simd4_t<int,N>
1102 {
1103 private:
1104 typedef int __vec4i_t[N];
1105
1106 union {
1107 __m128i simd4;
1108 __vec4i_t vec;
1109 };
1110
1111 public:
simd4_t(void)1112 simd4_t(void) { simd4 = _mm_setzero_si128(); }
simd4_t(int i)1113 simd4_t(int i) { simd4 = _mm_set1_epi32(i); }
simd4_t(int x,int y)1114 simd4_t(int x, int y) : simd4_t(x,y,0,0) {}
simd4_t(int x,int y,int z)1115 simd4_t(int x, int y, int z) : simd4_t(x,y,z,0) {}
simd4_t(int x,int y,int z,int w)1116 simd4_t(int x, int y, int z, int w) { simd4 = _mm_set_epi32(w,z,y,x); }
simd4_t(const __vec4i_t v)1117 simd4_t(const __vec4i_t v) { simd4 = _mm_loadu_si128((const __m128i*)v); }
simd4_t(const simd4_t<int,4> & v)1118 simd4_t(const simd4_t<int,4>& v) { simd4 = v.v4(); }
simd4_t(const simd4_t<int,3> & v)1119 simd4_t(const simd4_t<int,3>& v) { simd4 = v.v4(); }
simd4_t(const simd4_t<int,2> & v)1120 simd4_t(const simd4_t<int,2>& v) { simd4 = v.v4(); }
simd4_t(const __m128i & v)1121 simd4_t(const __m128i& v) { simd4 = v; }
1122
v4(void)1123 inline __m128i &v4(void) {
1124 return simd4;
1125 }
1126
v4(void) const1127 inline const __m128i &v4(void) const {
1128 return simd4;
1129 }
1130
__anon3e7c049c1202null1131 inline const int (&ptr(void) const)[N] {
1132 return vec;
1133 }
1134
__anon3e7c049c1302null1135 inline int (&ptr(void))[N] {
1136 return vec;
1137 }
1138
1139 inline const int& operator[](unsigned n) const {
1140 assert(n<N);
1141 return vec[n];
1142 }
1143
operator [](unsigned n)1144 inline int& operator[](unsigned n) {
1145 assert(n<N);
1146 return vec[n];
1147 }
1148
1149 template<int M>
operator =(const simd4_t<int,M> & v)1150 inline simd4_t<int,N>& operator=(const simd4_t<int,M>& v) {
1151 simd4 = simd4_t<int,N>(v).v4();
1152 return *this;
1153 }
operator =(const __m128 & v)1154 inline simd4_t<int,N>& operator=(const __m128& v) {
1155 simd4 = v;
1156 return *this;
1157 }
1158
operator +=(int i)1159 inline simd4_t<int,N>& operator+=(int i) {
1160 simd4 = _mm_add_epi32(simd4, _mm_set1_epi32(i));
1161 return *this;
1162 }
operator +=(const simd4_t<int,N> & v)1163 inline simd4_t<int,N>& operator+=(const simd4_t<int,N>& v) {
1164 simd4 = _mm_add_epi32(simd4, v.v4());
1165 return *this;
1166 }
1167
operator -=(int i)1168 inline simd4_t<int,N>& operator-=(int i) {
1169 simd4 = _mm_sub_epi32(simd4, _mm_set1_epi32(i));
1170 return *this;
1171 }
operator -=(const simd4_t<int,N> & v)1172 inline simd4_t<int,N>& operator-=(const simd4_t<int,N>& v) {
1173 simd4 = _mm_sub_epi32(simd4, v.v4());
1174 return *this;
1175 }
1176
operator *=(int i)1177 inline simd4_t<int,N>& operator*=(int i) {
1178 return operator*=(_mm_set1_epi32(i));
1179 }
operator *=(const simd4_t<int,N> & v)1180 inline simd4_t<int,N>& operator*=(const simd4_t<int,N>& v) {
1181 return operator*=(v.v4());
1182 }
1183 // https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
operator *=(const __m128i & v)1184 inline simd4_t<int,N>& operator*=(const __m128i& v) {
1185 #ifdef __SSE4_1__
1186 simd4 = _mm_mullo_epi32(simd4, v);
1187 #else
1188 __m128i tmp1 = _mm_mul_epu32(simd4, v);
1189 __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(simd4,4),
1190 _mm_srli_si128(v,4));
1191 simd4 =_mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1,_MM_SHUFFLE (0,0,2,0)),
1192 _mm_shuffle_epi32(tmp2, _MM_SHUFFLE (0,0,2,0)));
1193 #endif
1194 return *this;
1195 }
1196
operator /=(int s)1197 inline simd4_t<int,N>& operator/=(int s) {
1198 return operator/=(simd4_t<int,4>(s));
1199 }
operator /=(const simd4_t<int,N> & v)1200 inline simd4_t<int,N>& operator/=(const simd4_t<int,N>& v) {
1201 for (int i=0; i<N; ++i) {
1202 vec[i] /= v[i];
1203 }
1204 return *this;
1205 }
1206 };
1207
// Lane masks for the narrow integer widths: imask2 keeps lanes 0-1,
// imask3 keeps lanes 0-2 (the m2a32/m3a32 tables are defined earlier).
static const __m128i imask2 = _mm_load_si128((__m128i*)simd4::m2a32);
static const __m128i imask3 = _mm_load_si128((__m128i*)simd4::m3a32);
1210
// Four-argument constructors; lanes beyond the vector width are masked
// to zero to preserve the layout invariant.
template<>
inline simd4_t<int,4>::simd4_t(int x, int y, int z, int w) {
    simd4 = _mm_set_epi32(w,z,y,x);
}
template<>
inline simd4_t<int,3>::simd4_t(int x, int y, int z, int w) {
    // w is forced to zero regardless of the value passed in
    simd4 = _mm_and_si128(_mm_set_epi32(w,z,y,x), imask3);
}
template<>
inline simd4_t<int,2>::simd4_t(int x, int y, int z, int w) {
    // z and w are forced to zero regardless of the values passed in
    simd4 = _mm_and_si128(_mm_set_epi32(w,z,y,x), imask2);
}
1223
// Copy/conversion constructors between the int vector widths.
// Widening copies the register verbatim (tail lanes are zero by
// invariant); narrowing masks the dropped lanes to zero.
template<>
inline simd4_t<int,4>::simd4_t(const simd4_t<int,4>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<int,4>::simd4_t(const simd4_t<int,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<int,4>::simd4_t(const simd4_t<int,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<int,3>::simd4_t(const simd4_t<int,4>& v) {
    // narrowing 4 -> 3: clear the w lane
    simd4 = _mm_and_si128(v.v4(), imask3);
}
template<>
inline simd4_t<int,3>::simd4_t(const simd4_t<int,3>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<int,3>::simd4_t(const simd4_t<int,2>& v) {
    simd4 = v.v4();
}
template<>
inline simd4_t<int,2>::simd4_t(const simd4_t<int,4>& v) {
    // narrowing to 2 lanes: clear z and w
    simd4 = _mm_and_si128(v.v4(), imask2);
}
template<>
inline simd4_t<int,2>::simd4_t(const simd4_t<int,3>& v) {
    simd4 = _mm_and_si128(v.v4(), imask2);
}
template<>
inline simd4_t<int,2>::simd4_t(const simd4_t<int,2>& v) {
    simd4 = v.v4();
}
1260
1261 template<>
simd4_t(const __vec4i_t v)1262 inline simd4_t<int,4>::simd4_t(const __vec4i_t v) {
1263 simd4 = _mm_loadu_si128((__m128i*)v);
1264 }
1265 template<>
simd4_t(const __vec4i_t v)1266 inline simd4_t<int,3>::simd4_t(const __vec4i_t v) {
1267 simd4 = _mm_and_si128(_mm_loadu_si128((__m128i*)v), imask3);
1268 }
1269 template<>
simd4_t(const __vec4i_t v)1270 inline simd4_t<int,2>::simd4_t(const __vec4i_t v) {
1271 simd4 = _mm_and_si128(_mm_loadu_si128((__m128i*)v), imask2);
1272 }
1273 template<>
simd4_t(int i)1274 inline simd4_t<int,4>::simd4_t(int i) {
1275 simd4 = _mm_set1_epi32(i);
1276 }
1277 template<>
simd4_t(int i)1278 inline simd4_t<int,3>::simd4_t(int i) {
1279 simd4 = _mm_and_si128(_mm_set1_epi32(i), imask3);
1280 }
1281 template<>
simd4_t(int i)1282 inline simd4_t<int,2>::simd4_t(int i) {
1283 simd4 = _mm_and_si128(_mm_set1_epi32(i), imask2);
1284 }
1285
1286
1287 namespace simd4
1288 {
1289
1290 # ifdef __SSE4_1__
1291 template<int N>
min(simd4_t<int,N> v1,const simd4_t<int,N> & v2)1292 inline simd4_t<int,N> min(simd4_t<int,N> v1, const simd4_t<int,N>& v2) {
1293 v1 = _mm_min_epi32(v1.v4(), v2.v4());
1294 return v1;
1295 }
1296
1297 template<int N>
max(simd4_t<int,N> v1,const simd4_t<int,N> & v2)1298 inline simd4_t<int,N> max(simd4_t<int,N> v1, const simd4_t<int,N>& v2) {
1299 v1 = _mm_max_epi32(v1.v4(), v2.v4());
1300 return v1;
1301 }
1302 # endif /* __SSE4_1__ */
1303
1304 } /* namespace simd4 */
1305
1306 # endif
1307
1308 #endif /* ENABLE_SIMD_CODE */
1309
1310 #endif /* __SIMD_H__ */
1311
1312