1 /**
2  * Copyright 2014-2016 Andreas Schäfer
3  * Copyright 2015 Kurt Kanzenbach
4  *
5  * Distributed under the Boost Software License, Version 1.0. (See accompanying
6  * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
7  */
8 
9 #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_DOUBLE_16_HPP
10 #define FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_DOUBLE_16_HPP
11 
12 #if (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE) ||             \
13     (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE2) ||            \
14     (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE4_1)
15 
16 #include <emmintrin.h>
17 #include <libflatarray/detail/short_vec_helpers.hpp>
18 #include <libflatarray/config.h>
19 
20 #ifdef LIBFLATARRAY_WITH_CPP14
21 #include <initializer_list>
22 #endif
23 
24 namespace LibFlatArray {
25 
26 template<typename CARGO, int ARITY>
27 class short_vec;
28 
29 #ifdef __ICC
30 // disabling this warning as implicit type conversion is exactly our goal here:
31 #pragma warning push
32 #pragma warning (disable: 2304)
33 #endif
34 
35 template<>
36 class short_vec<double, 16>
37 {
38 public:
39     static const int ARITY = 16;
40     typedef short_vec<double, 16> mask_type;
41     typedef short_vec_strategy::sse strategy;
42 
43     template<typename _CharT, typename _Traits>
44     friend std::basic_ostream<_CharT, _Traits>& operator<<(
45         std::basic_ostream<_CharT, _Traits>& __os,
46         const short_vec<double, 16>& vec);
47 
48     inline
short_vec(const double data=0)49     short_vec(const double data = 0) :
50         val1(_mm_set1_pd(data)),
51         val2(_mm_set1_pd(data)),
52         val3(_mm_set1_pd(data)),
53         val4(_mm_set1_pd(data)),
54         val5(_mm_set1_pd(data)),
55         val6(_mm_set1_pd(data)),
56         val7(_mm_set1_pd(data)),
57         val8(_mm_set1_pd(data))
58     {}
59 
60     inline
short_vec(const double * data)61     short_vec(const double *data)
62     {
63         load(data);
64     }
65 
66     inline
short_vec(const __m128d & val1,const __m128d & val2,const __m128d & val3,const __m128d & val4,const __m128d & val5,const __m128d & val6,const __m128d & val7,const __m128d & val8)67     short_vec(
68         const __m128d& val1,
69         const __m128d& val2,
70         const __m128d& val3,
71         const __m128d& val4,
72         const __m128d& val5,
73         const __m128d& val6,
74         const __m128d& val7,
75         const __m128d& val8) :
76         val1(val1),
77         val2(val2),
78         val3(val3),
79         val4(val4),
80         val5(val5),
81         val6(val6),
82         val7(val7),
83         val8(val8)
84     {}
85 
86 #ifdef LIBFLATARRAY_WITH_CPP14
87     inline
short_vec(const std::initializer_list<double> & il)88     short_vec(const std::initializer_list<double>& il)
89     {
90         const double *ptr = static_cast<const double *>(&(*il.begin()));
91         load(ptr);
92     }
93 #endif
94 
95     inline
any() const96     bool any() const
97     {
98         __m128d buf1 = _mm_or_pd(
99             _mm_or_pd(
100                 _mm_or_pd(val1, val2),
101                 _mm_or_pd(val3, val4)),
102             _mm_or_pd(
103                 _mm_or_pd(val5, val6),
104                 _mm_or_pd(val7, val8)));
105         __m128d buf2 = _mm_shuffle_pd(buf1, buf1, 1);
106 
107         return _mm_cvtsd_f64(buf1) || _mm_cvtsd_f64(buf2);
108     }
109 
110     inline
get(int i) const111     double get(int i) const
112     {
113         __m128d buf;
114         if (i < 8) {
115             if (i < 4) {
116                 if (i < 2) {
117                     buf = val1;
118                 } else {
119                     buf = val2;
120                 }
121             } else {
122                 if (i < 6) {
123                     buf = val3;
124                 } else {
125                     buf = val4;
126                 }
127             }
128         } else {
129             if (i < 12) {
130                 if (i < 10) {
131                     buf = val5;
132                 } else {
133                     buf = val6;
134                 }
135             } else {
136                 if (i < 14) {
137                     buf = val7;
138                 } else {
139                     buf = val8;
140                 }
141             }
142         }
143 
144         i &= 1;
145 
146         if (i == 0) {
147             return _mm_cvtsd_f64(buf);
148         }
149 
150         buf = _mm_shuffle_pd(buf, buf, 1);
151         return _mm_cvtsd_f64(buf);
152     }
153 
154     inline
operator -=(const short_vec<double,16> & other)155     void operator-=(const short_vec<double, 16>& other)
156     {
157         val1 = _mm_sub_pd(val1, other.val1);
158         val2 = _mm_sub_pd(val2, other.val2);
159         val3 = _mm_sub_pd(val3, other.val3);
160         val4 = _mm_sub_pd(val4, other.val4);
161         val5 = _mm_sub_pd(val5, other.val5);
162         val6 = _mm_sub_pd(val6, other.val6);
163         val7 = _mm_sub_pd(val7, other.val7);
164         val8 = _mm_sub_pd(val8, other.val8);
165     }
166 
167     inline
operator -(const short_vec<double,16> & other) const168     short_vec<double, 16> operator-(const short_vec<double, 16>& other) const
169     {
170         return short_vec<double, 16>(
171             _mm_sub_pd(val1, other.val1),
172             _mm_sub_pd(val2, other.val2),
173             _mm_sub_pd(val3, other.val3),
174             _mm_sub_pd(val4, other.val4),
175             _mm_sub_pd(val5, other.val5),
176             _mm_sub_pd(val6, other.val6),
177             _mm_sub_pd(val7, other.val7),
178             _mm_sub_pd(val8, other.val8));
179     }
180 
181     inline
operator +=(const short_vec<double,16> & other)182     void operator+=(const short_vec<double, 16>& other)
183     {
184         val1 = _mm_add_pd(val1, other.val1);
185         val2 = _mm_add_pd(val2, other.val2);
186         val3 = _mm_add_pd(val3, other.val3);
187         val4 = _mm_add_pd(val4, other.val4);
188         val5 = _mm_add_pd(val5, other.val5);
189         val6 = _mm_add_pd(val6, other.val6);
190         val7 = _mm_add_pd(val7, other.val7);
191         val8 = _mm_add_pd(val8, other.val8);
192     }
193 
194     inline
operator +(const short_vec<double,16> & other) const195     short_vec<double, 16> operator+(const short_vec<double, 16>& other) const
196     {
197         return short_vec<double, 16>(
198             _mm_add_pd(val1, other.val1),
199             _mm_add_pd(val2, other.val2),
200             _mm_add_pd(val3, other.val3),
201             _mm_add_pd(val4, other.val4),
202             _mm_add_pd(val5, other.val5),
203             _mm_add_pd(val6, other.val6),
204             _mm_add_pd(val7, other.val7),
205             _mm_add_pd(val8, other.val8));
206     }
207 
208     inline
operator *=(const short_vec<double,16> & other)209     void operator*=(const short_vec<double, 16>& other)
210     {
211         val1 = _mm_mul_pd(val1, other.val1);
212         val2 = _mm_mul_pd(val2, other.val2);
213         val3 = _mm_mul_pd(val3, other.val3);
214         val4 = _mm_mul_pd(val4, other.val4);
215         val5 = _mm_mul_pd(val5, other.val5);
216         val6 = _mm_mul_pd(val6, other.val6);
217         val7 = _mm_mul_pd(val7, other.val7);
218         val8 = _mm_mul_pd(val8, other.val8);
219     }
220 
221     inline
operator *(const short_vec<double,16> & other) const222     short_vec<double, 16> operator*(const short_vec<double, 16>& other) const
223     {
224         return short_vec<double, 16>(
225             _mm_mul_pd(val1, other.val1),
226             _mm_mul_pd(val2, other.val2),
227             _mm_mul_pd(val3, other.val3),
228             _mm_mul_pd(val4, other.val4),
229             _mm_mul_pd(val5, other.val5),
230             _mm_mul_pd(val6, other.val6),
231             _mm_mul_pd(val7, other.val7),
232             _mm_mul_pd(val8, other.val8));
233     }
234 
235     inline
operator /=(const short_vec<double,16> & other)236     void operator/=(const short_vec<double, 16>& other)
237     {
238         val1 = _mm_div_pd(val1, other.val1);
239         val2 = _mm_div_pd(val2, other.val2);
240         val3 = _mm_div_pd(val3, other.val3);
241         val4 = _mm_div_pd(val4, other.val4);
242         val5 = _mm_div_pd(val5, other.val5);
243         val6 = _mm_div_pd(val6, other.val6);
244         val7 = _mm_div_pd(val7, other.val7);
245         val8 = _mm_div_pd(val8, other.val8);
246     }
247 
248     inline
operator /(const short_vec<double,16> & other) const249     short_vec<double, 16> operator/(const short_vec<double, 16>& other) const
250     {
251         return short_vec<double, 16>(
252             _mm_div_pd(val1, other.val1),
253             _mm_div_pd(val2, other.val2),
254             _mm_div_pd(val3, other.val3),
255             _mm_div_pd(val4, other.val4),
256             _mm_div_pd(val5, other.val5),
257             _mm_div_pd(val6, other.val6),
258             _mm_div_pd(val7, other.val7),
259             _mm_div_pd(val8, other.val8));
260     }
261 
262     inline
263     short_vec<double, 16> operator/(const sqrt_reference<double, 16>& other) const;
264 
265     inline
operator <(const short_vec<double,16> & other) const266     short_vec<double, 16> operator<(const short_vec<double, 16>& other) const
267     {
268         return short_vec<double, 16>(
269             _mm_cmplt_pd(val1, other.val1),
270             _mm_cmplt_pd(val2, other.val2),
271             _mm_cmplt_pd(val3, other.val3),
272             _mm_cmplt_pd(val4, other.val4),
273             _mm_cmplt_pd(val5, other.val5),
274             _mm_cmplt_pd(val6, other.val6),
275             _mm_cmplt_pd(val7, other.val7),
276             _mm_cmplt_pd(val8, other.val8));
277     }
278 
279     inline
operator <=(const short_vec<double,16> & other) const280     short_vec<double, 16> operator<=(const short_vec<double, 16>& other) const
281     {
282         return short_vec<double, 16>(
283             _mm_cmple_pd(val1, other.val1),
284             _mm_cmple_pd(val2, other.val2),
285             _mm_cmple_pd(val3, other.val3),
286             _mm_cmple_pd(val4, other.val4),
287             _mm_cmple_pd(val5, other.val5),
288             _mm_cmple_pd(val6, other.val6),
289             _mm_cmple_pd(val7, other.val7),
290             _mm_cmple_pd(val8, other.val8));
291     }
292 
293     inline
operator ==(const short_vec<double,16> & other) const294     short_vec<double, 16> operator==(const short_vec<double, 16>& other) const
295     {
296         return short_vec<double, 16>(
297             _mm_cmpeq_pd(val1, other.val1),
298             _mm_cmpeq_pd(val2, other.val2),
299             _mm_cmpeq_pd(val3, other.val3),
300             _mm_cmpeq_pd(val4, other.val4),
301             _mm_cmpeq_pd(val5, other.val5),
302             _mm_cmpeq_pd(val6, other.val6),
303             _mm_cmpeq_pd(val7, other.val7),
304             _mm_cmpeq_pd(val8, other.val8));
305     }
306 
307     inline
operator >(const short_vec<double,16> & other) const308     short_vec<double, 16> operator>(const short_vec<double, 16>& other) const
309     {
310         return short_vec<double, 16>(
311             _mm_cmpgt_pd(val1, other.val1),
312             _mm_cmpgt_pd(val2, other.val2),
313             _mm_cmpgt_pd(val3, other.val3),
314             _mm_cmpgt_pd(val4, other.val4),
315             _mm_cmpgt_pd(val5, other.val5),
316             _mm_cmpgt_pd(val6, other.val6),
317             _mm_cmpgt_pd(val7, other.val7),
318             _mm_cmpgt_pd(val8, other.val8));
319     }
320 
321     inline
operator >=(const short_vec<double,16> & other) const322     short_vec<double, 16> operator>=(const short_vec<double, 16>& other) const
323     {
324         return short_vec<double, 16>(
325             _mm_cmpge_pd(val1, other.val1),
326             _mm_cmpge_pd(val2, other.val2),
327             _mm_cmpge_pd(val3, other.val3),
328             _mm_cmpge_pd(val4, other.val4),
329             _mm_cmpge_pd(val5, other.val5),
330             _mm_cmpge_pd(val6, other.val6),
331             _mm_cmpge_pd(val7, other.val7),
332             _mm_cmpge_pd(val8, other.val8));
333     }
334 
335     inline
sqrt() const336     short_vec<double, 16> sqrt() const
337     {
338         return short_vec<double, 16>(
339             _mm_sqrt_pd(val1),
340             _mm_sqrt_pd(val2),
341             _mm_sqrt_pd(val3),
342             _mm_sqrt_pd(val4),
343             _mm_sqrt_pd(val5),
344             _mm_sqrt_pd(val6),
345             _mm_sqrt_pd(val7),
346             _mm_sqrt_pd(val8));
347     }
348 
349     inline
load(const double * data)350     void load(const double *data)
351     {
352         val1 = _mm_loadu_pd(data + 0);
353         val2 = _mm_loadu_pd(data + 2);
354         val3 = _mm_loadu_pd(data + 4);
355         val4 = _mm_loadu_pd(data + 6);
356         val5 = _mm_loadu_pd(data + 8);
357         val6 = _mm_loadu_pd(data + 10);
358         val7 = _mm_loadu_pd(data + 12);
359         val8 = _mm_loadu_pd(data + 14);
360     }
361 
362     inline
load_aligned(const double * data)363     void load_aligned(const double *data)
364     {
365         SHORTVEC_ASSERT_ALIGNED(data, 16);
366         val1 = _mm_load_pd(data +  0);
367         val2 = _mm_load_pd(data +  2);
368         val3 = _mm_load_pd(data +  4);
369         val4 = _mm_load_pd(data +  6);
370         val5 = _mm_load_pd(data +  8);
371         val6 = _mm_load_pd(data + 10);
372         val7 = _mm_load_pd(data + 12);
373         val8 = _mm_load_pd(data + 14);
374     }
375 
376     inline
store(double * data) const377     void store(double *data) const
378     {
379         _mm_storeu_pd(data +  0, val1);
380         _mm_storeu_pd(data +  2, val2);
381         _mm_storeu_pd(data +  4, val3);
382         _mm_storeu_pd(data +  6, val4);
383         _mm_storeu_pd(data +  8, val5);
384         _mm_storeu_pd(data + 10, val6);
385         _mm_storeu_pd(data + 12, val7);
386         _mm_storeu_pd(data + 14, val8);
387     }
388 
389     inline
store_aligned(double * data) const390     void store_aligned(double *data) const
391     {
392         SHORTVEC_ASSERT_ALIGNED(data, 16);
393         _mm_store_pd(data +  0, val1);
394         _mm_store_pd(data +  2, val2);
395         _mm_store_pd(data +  4, val3);
396         _mm_store_pd(data +  6, val4);
397         _mm_store_pd(data +  8, val5);
398         _mm_store_pd(data + 10, val6);
399         _mm_store_pd(data + 12, val7);
400         _mm_store_pd(data + 14, val8);
401     }
402 
403     inline
store_nt(double * data) const404     void store_nt(double *data) const
405     {
406         SHORTVEC_ASSERT_ALIGNED(data, 16);
407         _mm_stream_pd(data +  0, val1);
408         _mm_stream_pd(data +  2, val2);
409         _mm_stream_pd(data +  4, val3);
410         _mm_stream_pd(data +  6, val4);
411         _mm_stream_pd(data +  8, val5);
412         _mm_stream_pd(data + 10, val6);
413         _mm_stream_pd(data + 12, val7);
414         _mm_stream_pd(data + 14, val8);
415     }
416 
417     inline
gather(const double * ptr,const int * offsets)418     void gather(const double *ptr, const int *offsets)
419     {
420         val1 = _mm_loadl_pd(val1, ptr + offsets[ 0]);
421         val1 = _mm_loadh_pd(val1, ptr + offsets[ 1]);
422         val2 = _mm_loadl_pd(val2, ptr + offsets[ 2]);
423         val2 = _mm_loadh_pd(val2, ptr + offsets[ 3]);
424         val3 = _mm_loadl_pd(val3, ptr + offsets[ 4]);
425         val3 = _mm_loadh_pd(val3, ptr + offsets[ 5]);
426         val4 = _mm_loadl_pd(val4, ptr + offsets[ 6]);
427         val4 = _mm_loadh_pd(val4, ptr + offsets[ 7]);
428         val5 = _mm_loadl_pd(val5, ptr + offsets[ 8]);
429         val5 = _mm_loadh_pd(val5, ptr + offsets[ 9]);
430         val6 = _mm_loadl_pd(val6, ptr + offsets[10]);
431         val6 = _mm_loadh_pd(val6, ptr + offsets[11]);
432         val7 = _mm_loadl_pd(val7, ptr + offsets[12]);
433         val7 = _mm_loadh_pd(val7, ptr + offsets[13]);
434         val8 = _mm_loadl_pd(val8, ptr + offsets[14]);
435         val8 = _mm_loadh_pd(val8, ptr + offsets[15]);
436     }
437 
438     inline
scatter(double * ptr,const int * offsets) const439     void scatter(double *ptr, const int *offsets) const
440     {
441         _mm_storel_pd(ptr + offsets[ 0], val1);
442         _mm_storeh_pd(ptr + offsets[ 1], val1);
443         _mm_storel_pd(ptr + offsets[ 2], val2);
444         _mm_storeh_pd(ptr + offsets[ 3], val2);
445         _mm_storel_pd(ptr + offsets[ 4], val3);
446         _mm_storeh_pd(ptr + offsets[ 5], val3);
447         _mm_storel_pd(ptr + offsets[ 6], val4);
448         _mm_storeh_pd(ptr + offsets[ 7], val4);
449         _mm_storel_pd(ptr + offsets[ 8], val5);
450         _mm_storeh_pd(ptr + offsets[ 9], val5);
451         _mm_storel_pd(ptr + offsets[10], val6);
452         _mm_storeh_pd(ptr + offsets[11], val6);
453         _mm_storel_pd(ptr + offsets[12], val7);
454         _mm_storeh_pd(ptr + offsets[13], val7);
455         _mm_storel_pd(ptr + offsets[14], val8);
456         _mm_storeh_pd(ptr + offsets[15], val8);
457     }
458 
459 private:
460     __m128d val1;
461     __m128d val2;
462     __m128d val3;
463     __m128d val4;
464     __m128d val5;
465     __m128d val6;
466     __m128d val7;
467     __m128d val8;
468 };
469 
470 #ifdef __ICC
471 #pragma warning pop
472 #endif
473 
474 inline
operator <<(double * data,const short_vec<double,16> & vec)475 void operator<<(double *data, const short_vec<double, 16>& vec)
476 {
477     vec.store(data);
478 }
479 
480 inline
sqrt(const short_vec<double,16> & vec)481 short_vec<double, 16> sqrt(const short_vec<double, 16>& vec)
482 {
483     return vec.sqrt();
484 }
485 
486 template<typename _CharT, typename _Traits>
487 std::basic_ostream<_CharT, _Traits>&
operator <<(std::basic_ostream<_CharT,_Traits> & __os,const short_vec<double,16> & vec)488 operator<<(std::basic_ostream<_CharT, _Traits>& __os,
489            const short_vec<double, 16>& vec)
490 {
491     const double *data1 = reinterpret_cast<const double *>(&vec.val1);
492     const double *data2 = reinterpret_cast<const double *>(&vec.val2);
493     const double *data3 = reinterpret_cast<const double *>(&vec.val3);
494     const double *data4 = reinterpret_cast<const double *>(&vec.val4);
495     const double *data5 = reinterpret_cast<const double *>(&vec.val5);
496     const double *data6 = reinterpret_cast<const double *>(&vec.val6);
497     const double *data7 = reinterpret_cast<const double *>(&vec.val7);
498     const double *data8 = reinterpret_cast<const double *>(&vec.val8);
499     __os << "["
500          << data1[0] << ", " << data1[1] << ", "
501          << data2[0] << ", " << data2[1] << ", "
502          << data3[0] << ", " << data3[1] << ", "
503          << data4[0] << ", " << data4[1] << ", "
504          << data5[0] << ", " << data5[1] << ", "
505          << data6[0] << ", " << data6[1] << ", "
506          << data7[0] << ", " << data7[1] << ", "
507          << data8[0] << ", " << data8[1] << "]";
508     return __os;
509 }
510 
511 }
512 
513 #endif
514 
515 #endif
516