1 /**
2 * Copyright 2014-2016 Andreas Schäfer
3 * Copyright 2015 Kurt Kanzenbach
4 *
5 * Distributed under the Boost Software License, Version 1.0. (See accompanying
6 * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
7 */
8
9 #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_DOUBLE_16_HPP
10 #define FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_DOUBLE_16_HPP
11
12 #if (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE) || \
13 (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE2) || \
14 (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE4_1)
15
16 #include <emmintrin.h>
17 #include <libflatarray/detail/short_vec_helpers.hpp>
18 #include <libflatarray/config.h>
19
20 #ifdef LIBFLATARRAY_WITH_CPP14
21 #include <initializer_list>
22 #endif
23
24 namespace LibFlatArray {
25
26 template<typename CARGO, int ARITY>
27 class short_vec;
28
29 #ifdef __ICC
30 // disabling this warning as implicit type conversion is exactly our goal here:
31 #pragma warning push
32 #pragma warning (disable: 2304)
33 #endif
34
35 template<>
36 class short_vec<double, 16>
37 {
38 public:
39 static const int ARITY = 16;
40 typedef short_vec<double, 16> mask_type;
41 typedef short_vec_strategy::sse strategy;
42
43 template<typename _CharT, typename _Traits>
44 friend std::basic_ostream<_CharT, _Traits>& operator<<(
45 std::basic_ostream<_CharT, _Traits>& __os,
46 const short_vec<double, 16>& vec);
47
48 inline
short_vec(const double data=0)49 short_vec(const double data = 0) :
50 val1(_mm_set1_pd(data)),
51 val2(_mm_set1_pd(data)),
52 val3(_mm_set1_pd(data)),
53 val4(_mm_set1_pd(data)),
54 val5(_mm_set1_pd(data)),
55 val6(_mm_set1_pd(data)),
56 val7(_mm_set1_pd(data)),
57 val8(_mm_set1_pd(data))
58 {}
59
60 inline
short_vec(const double * data)61 short_vec(const double *data)
62 {
63 load(data);
64 }
65
66 inline
short_vec(const __m128d & val1,const __m128d & val2,const __m128d & val3,const __m128d & val4,const __m128d & val5,const __m128d & val6,const __m128d & val7,const __m128d & val8)67 short_vec(
68 const __m128d& val1,
69 const __m128d& val2,
70 const __m128d& val3,
71 const __m128d& val4,
72 const __m128d& val5,
73 const __m128d& val6,
74 const __m128d& val7,
75 const __m128d& val8) :
76 val1(val1),
77 val2(val2),
78 val3(val3),
79 val4(val4),
80 val5(val5),
81 val6(val6),
82 val7(val7),
83 val8(val8)
84 {}
85
86 #ifdef LIBFLATARRAY_WITH_CPP14
87 inline
short_vec(const std::initializer_list<double> & il)88 short_vec(const std::initializer_list<double>& il)
89 {
90 const double *ptr = static_cast<const double *>(&(*il.begin()));
91 load(ptr);
92 }
93 #endif
94
95 inline
any() const96 bool any() const
97 {
98 __m128d buf1 = _mm_or_pd(
99 _mm_or_pd(
100 _mm_or_pd(val1, val2),
101 _mm_or_pd(val3, val4)),
102 _mm_or_pd(
103 _mm_or_pd(val5, val6),
104 _mm_or_pd(val7, val8)));
105 __m128d buf2 = _mm_shuffle_pd(buf1, buf1, 1);
106
107 return _mm_cvtsd_f64(buf1) || _mm_cvtsd_f64(buf2);
108 }
109
110 inline
get(int i) const111 double get(int i) const
112 {
113 __m128d buf;
114 if (i < 8) {
115 if (i < 4) {
116 if (i < 2) {
117 buf = val1;
118 } else {
119 buf = val2;
120 }
121 } else {
122 if (i < 6) {
123 buf = val3;
124 } else {
125 buf = val4;
126 }
127 }
128 } else {
129 if (i < 12) {
130 if (i < 10) {
131 buf = val5;
132 } else {
133 buf = val6;
134 }
135 } else {
136 if (i < 14) {
137 buf = val7;
138 } else {
139 buf = val8;
140 }
141 }
142 }
143
144 i &= 1;
145
146 if (i == 0) {
147 return _mm_cvtsd_f64(buf);
148 }
149
150 buf = _mm_shuffle_pd(buf, buf, 1);
151 return _mm_cvtsd_f64(buf);
152 }
153
154 inline
operator -=(const short_vec<double,16> & other)155 void operator-=(const short_vec<double, 16>& other)
156 {
157 val1 = _mm_sub_pd(val1, other.val1);
158 val2 = _mm_sub_pd(val2, other.val2);
159 val3 = _mm_sub_pd(val3, other.val3);
160 val4 = _mm_sub_pd(val4, other.val4);
161 val5 = _mm_sub_pd(val5, other.val5);
162 val6 = _mm_sub_pd(val6, other.val6);
163 val7 = _mm_sub_pd(val7, other.val7);
164 val8 = _mm_sub_pd(val8, other.val8);
165 }
166
167 inline
operator -(const short_vec<double,16> & other) const168 short_vec<double, 16> operator-(const short_vec<double, 16>& other) const
169 {
170 return short_vec<double, 16>(
171 _mm_sub_pd(val1, other.val1),
172 _mm_sub_pd(val2, other.val2),
173 _mm_sub_pd(val3, other.val3),
174 _mm_sub_pd(val4, other.val4),
175 _mm_sub_pd(val5, other.val5),
176 _mm_sub_pd(val6, other.val6),
177 _mm_sub_pd(val7, other.val7),
178 _mm_sub_pd(val8, other.val8));
179 }
180
181 inline
operator +=(const short_vec<double,16> & other)182 void operator+=(const short_vec<double, 16>& other)
183 {
184 val1 = _mm_add_pd(val1, other.val1);
185 val2 = _mm_add_pd(val2, other.val2);
186 val3 = _mm_add_pd(val3, other.val3);
187 val4 = _mm_add_pd(val4, other.val4);
188 val5 = _mm_add_pd(val5, other.val5);
189 val6 = _mm_add_pd(val6, other.val6);
190 val7 = _mm_add_pd(val7, other.val7);
191 val8 = _mm_add_pd(val8, other.val8);
192 }
193
194 inline
operator +(const short_vec<double,16> & other) const195 short_vec<double, 16> operator+(const short_vec<double, 16>& other) const
196 {
197 return short_vec<double, 16>(
198 _mm_add_pd(val1, other.val1),
199 _mm_add_pd(val2, other.val2),
200 _mm_add_pd(val3, other.val3),
201 _mm_add_pd(val4, other.val4),
202 _mm_add_pd(val5, other.val5),
203 _mm_add_pd(val6, other.val6),
204 _mm_add_pd(val7, other.val7),
205 _mm_add_pd(val8, other.val8));
206 }
207
208 inline
operator *=(const short_vec<double,16> & other)209 void operator*=(const short_vec<double, 16>& other)
210 {
211 val1 = _mm_mul_pd(val1, other.val1);
212 val2 = _mm_mul_pd(val2, other.val2);
213 val3 = _mm_mul_pd(val3, other.val3);
214 val4 = _mm_mul_pd(val4, other.val4);
215 val5 = _mm_mul_pd(val5, other.val5);
216 val6 = _mm_mul_pd(val6, other.val6);
217 val7 = _mm_mul_pd(val7, other.val7);
218 val8 = _mm_mul_pd(val8, other.val8);
219 }
220
221 inline
operator *(const short_vec<double,16> & other) const222 short_vec<double, 16> operator*(const short_vec<double, 16>& other) const
223 {
224 return short_vec<double, 16>(
225 _mm_mul_pd(val1, other.val1),
226 _mm_mul_pd(val2, other.val2),
227 _mm_mul_pd(val3, other.val3),
228 _mm_mul_pd(val4, other.val4),
229 _mm_mul_pd(val5, other.val5),
230 _mm_mul_pd(val6, other.val6),
231 _mm_mul_pd(val7, other.val7),
232 _mm_mul_pd(val8, other.val8));
233 }
234
235 inline
operator /=(const short_vec<double,16> & other)236 void operator/=(const short_vec<double, 16>& other)
237 {
238 val1 = _mm_div_pd(val1, other.val1);
239 val2 = _mm_div_pd(val2, other.val2);
240 val3 = _mm_div_pd(val3, other.val3);
241 val4 = _mm_div_pd(val4, other.val4);
242 val5 = _mm_div_pd(val5, other.val5);
243 val6 = _mm_div_pd(val6, other.val6);
244 val7 = _mm_div_pd(val7, other.val7);
245 val8 = _mm_div_pd(val8, other.val8);
246 }
247
248 inline
operator /(const short_vec<double,16> & other) const249 short_vec<double, 16> operator/(const short_vec<double, 16>& other) const
250 {
251 return short_vec<double, 16>(
252 _mm_div_pd(val1, other.val1),
253 _mm_div_pd(val2, other.val2),
254 _mm_div_pd(val3, other.val3),
255 _mm_div_pd(val4, other.val4),
256 _mm_div_pd(val5, other.val5),
257 _mm_div_pd(val6, other.val6),
258 _mm_div_pd(val7, other.val7),
259 _mm_div_pd(val8, other.val8));
260 }
261
262 inline
263 short_vec<double, 16> operator/(const sqrt_reference<double, 16>& other) const;
264
265 inline
operator <(const short_vec<double,16> & other) const266 short_vec<double, 16> operator<(const short_vec<double, 16>& other) const
267 {
268 return short_vec<double, 16>(
269 _mm_cmplt_pd(val1, other.val1),
270 _mm_cmplt_pd(val2, other.val2),
271 _mm_cmplt_pd(val3, other.val3),
272 _mm_cmplt_pd(val4, other.val4),
273 _mm_cmplt_pd(val5, other.val5),
274 _mm_cmplt_pd(val6, other.val6),
275 _mm_cmplt_pd(val7, other.val7),
276 _mm_cmplt_pd(val8, other.val8));
277 }
278
279 inline
operator <=(const short_vec<double,16> & other) const280 short_vec<double, 16> operator<=(const short_vec<double, 16>& other) const
281 {
282 return short_vec<double, 16>(
283 _mm_cmple_pd(val1, other.val1),
284 _mm_cmple_pd(val2, other.val2),
285 _mm_cmple_pd(val3, other.val3),
286 _mm_cmple_pd(val4, other.val4),
287 _mm_cmple_pd(val5, other.val5),
288 _mm_cmple_pd(val6, other.val6),
289 _mm_cmple_pd(val7, other.val7),
290 _mm_cmple_pd(val8, other.val8));
291 }
292
293 inline
operator ==(const short_vec<double,16> & other) const294 short_vec<double, 16> operator==(const short_vec<double, 16>& other) const
295 {
296 return short_vec<double, 16>(
297 _mm_cmpeq_pd(val1, other.val1),
298 _mm_cmpeq_pd(val2, other.val2),
299 _mm_cmpeq_pd(val3, other.val3),
300 _mm_cmpeq_pd(val4, other.val4),
301 _mm_cmpeq_pd(val5, other.val5),
302 _mm_cmpeq_pd(val6, other.val6),
303 _mm_cmpeq_pd(val7, other.val7),
304 _mm_cmpeq_pd(val8, other.val8));
305 }
306
307 inline
operator >(const short_vec<double,16> & other) const308 short_vec<double, 16> operator>(const short_vec<double, 16>& other) const
309 {
310 return short_vec<double, 16>(
311 _mm_cmpgt_pd(val1, other.val1),
312 _mm_cmpgt_pd(val2, other.val2),
313 _mm_cmpgt_pd(val3, other.val3),
314 _mm_cmpgt_pd(val4, other.val4),
315 _mm_cmpgt_pd(val5, other.val5),
316 _mm_cmpgt_pd(val6, other.val6),
317 _mm_cmpgt_pd(val7, other.val7),
318 _mm_cmpgt_pd(val8, other.val8));
319 }
320
321 inline
operator >=(const short_vec<double,16> & other) const322 short_vec<double, 16> operator>=(const short_vec<double, 16>& other) const
323 {
324 return short_vec<double, 16>(
325 _mm_cmpge_pd(val1, other.val1),
326 _mm_cmpge_pd(val2, other.val2),
327 _mm_cmpge_pd(val3, other.val3),
328 _mm_cmpge_pd(val4, other.val4),
329 _mm_cmpge_pd(val5, other.val5),
330 _mm_cmpge_pd(val6, other.val6),
331 _mm_cmpge_pd(val7, other.val7),
332 _mm_cmpge_pd(val8, other.val8));
333 }
334
335 inline
sqrt() const336 short_vec<double, 16> sqrt() const
337 {
338 return short_vec<double, 16>(
339 _mm_sqrt_pd(val1),
340 _mm_sqrt_pd(val2),
341 _mm_sqrt_pd(val3),
342 _mm_sqrt_pd(val4),
343 _mm_sqrt_pd(val5),
344 _mm_sqrt_pd(val6),
345 _mm_sqrt_pd(val7),
346 _mm_sqrt_pd(val8));
347 }
348
349 inline
load(const double * data)350 void load(const double *data)
351 {
352 val1 = _mm_loadu_pd(data + 0);
353 val2 = _mm_loadu_pd(data + 2);
354 val3 = _mm_loadu_pd(data + 4);
355 val4 = _mm_loadu_pd(data + 6);
356 val5 = _mm_loadu_pd(data + 8);
357 val6 = _mm_loadu_pd(data + 10);
358 val7 = _mm_loadu_pd(data + 12);
359 val8 = _mm_loadu_pd(data + 14);
360 }
361
362 inline
load_aligned(const double * data)363 void load_aligned(const double *data)
364 {
365 SHORTVEC_ASSERT_ALIGNED(data, 16);
366 val1 = _mm_load_pd(data + 0);
367 val2 = _mm_load_pd(data + 2);
368 val3 = _mm_load_pd(data + 4);
369 val4 = _mm_load_pd(data + 6);
370 val5 = _mm_load_pd(data + 8);
371 val6 = _mm_load_pd(data + 10);
372 val7 = _mm_load_pd(data + 12);
373 val8 = _mm_load_pd(data + 14);
374 }
375
376 inline
store(double * data) const377 void store(double *data) const
378 {
379 _mm_storeu_pd(data + 0, val1);
380 _mm_storeu_pd(data + 2, val2);
381 _mm_storeu_pd(data + 4, val3);
382 _mm_storeu_pd(data + 6, val4);
383 _mm_storeu_pd(data + 8, val5);
384 _mm_storeu_pd(data + 10, val6);
385 _mm_storeu_pd(data + 12, val7);
386 _mm_storeu_pd(data + 14, val8);
387 }
388
389 inline
store_aligned(double * data) const390 void store_aligned(double *data) const
391 {
392 SHORTVEC_ASSERT_ALIGNED(data, 16);
393 _mm_store_pd(data + 0, val1);
394 _mm_store_pd(data + 2, val2);
395 _mm_store_pd(data + 4, val3);
396 _mm_store_pd(data + 6, val4);
397 _mm_store_pd(data + 8, val5);
398 _mm_store_pd(data + 10, val6);
399 _mm_store_pd(data + 12, val7);
400 _mm_store_pd(data + 14, val8);
401 }
402
403 inline
store_nt(double * data) const404 void store_nt(double *data) const
405 {
406 SHORTVEC_ASSERT_ALIGNED(data, 16);
407 _mm_stream_pd(data + 0, val1);
408 _mm_stream_pd(data + 2, val2);
409 _mm_stream_pd(data + 4, val3);
410 _mm_stream_pd(data + 6, val4);
411 _mm_stream_pd(data + 8, val5);
412 _mm_stream_pd(data + 10, val6);
413 _mm_stream_pd(data + 12, val7);
414 _mm_stream_pd(data + 14, val8);
415 }
416
417 inline
gather(const double * ptr,const int * offsets)418 void gather(const double *ptr, const int *offsets)
419 {
420 val1 = _mm_loadl_pd(val1, ptr + offsets[ 0]);
421 val1 = _mm_loadh_pd(val1, ptr + offsets[ 1]);
422 val2 = _mm_loadl_pd(val2, ptr + offsets[ 2]);
423 val2 = _mm_loadh_pd(val2, ptr + offsets[ 3]);
424 val3 = _mm_loadl_pd(val3, ptr + offsets[ 4]);
425 val3 = _mm_loadh_pd(val3, ptr + offsets[ 5]);
426 val4 = _mm_loadl_pd(val4, ptr + offsets[ 6]);
427 val4 = _mm_loadh_pd(val4, ptr + offsets[ 7]);
428 val5 = _mm_loadl_pd(val5, ptr + offsets[ 8]);
429 val5 = _mm_loadh_pd(val5, ptr + offsets[ 9]);
430 val6 = _mm_loadl_pd(val6, ptr + offsets[10]);
431 val6 = _mm_loadh_pd(val6, ptr + offsets[11]);
432 val7 = _mm_loadl_pd(val7, ptr + offsets[12]);
433 val7 = _mm_loadh_pd(val7, ptr + offsets[13]);
434 val8 = _mm_loadl_pd(val8, ptr + offsets[14]);
435 val8 = _mm_loadh_pd(val8, ptr + offsets[15]);
436 }
437
438 inline
scatter(double * ptr,const int * offsets) const439 void scatter(double *ptr, const int *offsets) const
440 {
441 _mm_storel_pd(ptr + offsets[ 0], val1);
442 _mm_storeh_pd(ptr + offsets[ 1], val1);
443 _mm_storel_pd(ptr + offsets[ 2], val2);
444 _mm_storeh_pd(ptr + offsets[ 3], val2);
445 _mm_storel_pd(ptr + offsets[ 4], val3);
446 _mm_storeh_pd(ptr + offsets[ 5], val3);
447 _mm_storel_pd(ptr + offsets[ 6], val4);
448 _mm_storeh_pd(ptr + offsets[ 7], val4);
449 _mm_storel_pd(ptr + offsets[ 8], val5);
450 _mm_storeh_pd(ptr + offsets[ 9], val5);
451 _mm_storel_pd(ptr + offsets[10], val6);
452 _mm_storeh_pd(ptr + offsets[11], val6);
453 _mm_storel_pd(ptr + offsets[12], val7);
454 _mm_storeh_pd(ptr + offsets[13], val7);
455 _mm_storel_pd(ptr + offsets[14], val8);
456 _mm_storeh_pd(ptr + offsets[15], val8);
457 }
458
459 private:
460 __m128d val1;
461 __m128d val2;
462 __m128d val3;
463 __m128d val4;
464 __m128d val5;
465 __m128d val6;
466 __m128d val7;
467 __m128d val8;
468 };
469
470 #ifdef __ICC
471 #pragma warning pop
472 #endif
473
474 inline
operator <<(double * data,const short_vec<double,16> & vec)475 void operator<<(double *data, const short_vec<double, 16>& vec)
476 {
477 vec.store(data);
478 }
479
480 inline
sqrt(const short_vec<double,16> & vec)481 short_vec<double, 16> sqrt(const short_vec<double, 16>& vec)
482 {
483 return vec.sqrt();
484 }
485
486 template<typename _CharT, typename _Traits>
487 std::basic_ostream<_CharT, _Traits>&
operator <<(std::basic_ostream<_CharT,_Traits> & __os,const short_vec<double,16> & vec)488 operator<<(std::basic_ostream<_CharT, _Traits>& __os,
489 const short_vec<double, 16>& vec)
490 {
491 const double *data1 = reinterpret_cast<const double *>(&vec.val1);
492 const double *data2 = reinterpret_cast<const double *>(&vec.val2);
493 const double *data3 = reinterpret_cast<const double *>(&vec.val3);
494 const double *data4 = reinterpret_cast<const double *>(&vec.val4);
495 const double *data5 = reinterpret_cast<const double *>(&vec.val5);
496 const double *data6 = reinterpret_cast<const double *>(&vec.val6);
497 const double *data7 = reinterpret_cast<const double *>(&vec.val7);
498 const double *data8 = reinterpret_cast<const double *>(&vec.val8);
499 __os << "["
500 << data1[0] << ", " << data1[1] << ", "
501 << data2[0] << ", " << data2[1] << ", "
502 << data3[0] << ", " << data3[1] << ", "
503 << data4[0] << ", " << data4[1] << ", "
504 << data5[0] << ", " << data5[1] << ", "
505 << data6[0] << ", " << data6[1] << ", "
506 << data7[0] << ", " << data7[1] << ", "
507 << data8[0] << ", " << data8[1] << "]";
508 return __os;
509 }
510
511 }
512
513 #endif
514
515 #endif
516