1 /**
2  * Copyright 2014-2016 Andreas Schäfer
3  * Copyright 2015 Kurt Kanzenbach
4  *
5  * Distributed under the Boost Software License, Version 1.0. (See accompanying
6  * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
7  */
8 
9 #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_DOUBLE_8_HPP
10 #define FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_DOUBLE_8_HPP
11 
12 #if (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE) ||             \
13     (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE2) ||            \
14     (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE4_1)
15 
16 #include <emmintrin.h>
17 #include <libflatarray/detail/short_vec_helpers.hpp>
18 #include <libflatarray/config.h>
19 
20 #ifdef LIBFLATARRAY_WITH_CPP14
21 #include <initializer_list>
22 #endif
23 
24 namespace LibFlatArray {
25 
26 template<typename CARGO, int ARITY>
27 class short_vec;
28 
29 #ifdef __ICC
30 // disabling this warning as implicit type conversion is exactly our goal here:
31 #pragma warning push
32 #pragma warning (disable: 2304)
33 #endif
34 
35 template<>
36 class short_vec<double, 8>
37 {
38 public:
39     static const int ARITY = 8;
40     typedef short_vec<double, 8> mask_type;
41     typedef short_vec_strategy::sse strategy;
42 
43     template<typename _CharT, typename _Traits>
44     friend std::basic_ostream<_CharT, _Traits>& operator<<(
45         std::basic_ostream<_CharT, _Traits>& __os,
46         const short_vec<double, 8>& vec);
47 
48     inline
short_vec(const double data=0)49     short_vec(const double data = 0) :
50         val1(_mm_set1_pd(data)),
51         val2(_mm_set1_pd(data)),
52         val3(_mm_set1_pd(data)),
53         val4(_mm_set1_pd(data))
54     {}
55 
56     inline
short_vec(const double * data)57     short_vec(const double *data)
58     {
59         load(data);
60     }
61 
62     inline
short_vec(const __m128d & val1,const __m128d & val2,const __m128d & val3,const __m128d & val4)63     short_vec(const __m128d& val1, const __m128d& val2, const __m128d& val3, const __m128d& val4) :
64         val1(val1),
65         val2(val2),
66         val3(val3),
67         val4(val4)
68     {}
69 
70 #ifdef LIBFLATARRAY_WITH_CPP14
71     inline
short_vec(const std::initializer_list<double> & il)72     short_vec(const std::initializer_list<double>& il)
73     {
74         const double *ptr = static_cast<const double *>(&(*il.begin()));
75         load(ptr);
76     }
77 #endif
78 
79     inline
any() const80     bool any() const
81     {
82         __m128d buf1 = _mm_or_pd(_mm_or_pd(val1, val2),
83                                 _mm_or_pd(val3, val4));
84         __m128d buf2 = _mm_shuffle_pd(buf1, buf1, 1);
85 
86         return _mm_cvtsd_f64(buf1) || _mm_cvtsd_f64(buf2);
87     }
88 
89     inline
get(int i) const90     double get(int i) const
91     {
92         __m128d buf;
93         if (i < 4) {
94             if (i < 2) {
95                 buf = val1;
96             } else {
97                 buf = val2;
98             }
99         } else {
100             if (i < 6) {
101                 buf = val3;
102             } else {
103                 buf = val4;
104             }
105         }
106 
107         i &= 1;
108 
109         if (i == 0) {
110             return _mm_cvtsd_f64(buf);
111         }
112 
113         buf = _mm_shuffle_pd(buf, buf, 1);
114         return _mm_cvtsd_f64(buf);
115     }
116 
117     inline
operator -=(const short_vec<double,8> & other)118     void operator-=(const short_vec<double, 8>& other)
119     {
120         val1 = _mm_sub_pd(val1, other.val1);
121         val2 = _mm_sub_pd(val2, other.val2);
122         val3 = _mm_sub_pd(val3, other.val3);
123         val4 = _mm_sub_pd(val4, other.val4);
124     }
125 
126     inline
operator -(const short_vec<double,8> & other) const127     short_vec<double, 8> operator-(const short_vec<double, 8>& other) const
128     {
129         return short_vec<double, 8>(
130             _mm_sub_pd(val1, other.val1),
131             _mm_sub_pd(val2, other.val2),
132             _mm_sub_pd(val3, other.val3),
133             _mm_sub_pd(val4, other.val4));
134     }
135 
136     inline
operator +=(const short_vec<double,8> & other)137     void operator+=(const short_vec<double, 8>& other)
138     {
139         val1 = _mm_add_pd(val1, other.val1);
140         val2 = _mm_add_pd(val2, other.val2);
141         val3 = _mm_add_pd(val3, other.val3);
142         val4 = _mm_add_pd(val4, other.val4);
143     }
144 
145     inline
operator +(const short_vec<double,8> & other) const146     short_vec<double, 8> operator+(const short_vec<double, 8>& other) const
147     {
148         return short_vec<double, 8>(
149             _mm_add_pd(val1, other.val1),
150             _mm_add_pd(val2, other.val2),
151             _mm_add_pd(val3, other.val3),
152             _mm_add_pd(val4, other.val4));
153     }
154 
155     inline
operator *=(const short_vec<double,8> & other)156     void operator*=(const short_vec<double, 8>& other)
157     {
158         val1 = _mm_mul_pd(val1, other.val1);
159         val2 = _mm_mul_pd(val2, other.val2);
160         val3 = _mm_mul_pd(val3, other.val3);
161         val4 = _mm_mul_pd(val4, other.val4);
162     }
163 
164     inline
operator *(const short_vec<double,8> & other) const165     short_vec<double, 8> operator*(const short_vec<double, 8>& other) const
166     {
167         return short_vec<double, 8>(
168             _mm_mul_pd(val1, other.val1),
169             _mm_mul_pd(val2, other.val2),
170             _mm_mul_pd(val3, other.val3),
171             _mm_mul_pd(val4, other.val4));
172     }
173 
174     inline
operator /=(const short_vec<double,8> & other)175     void operator/=(const short_vec<double, 8>& other)
176     {
177         val1 = _mm_div_pd(val1, other.val1);
178         val2 = _mm_div_pd(val2, other.val2);
179         val3 = _mm_div_pd(val3, other.val3);
180         val4 = _mm_div_pd(val4, other.val4);
181     }
182 
183     inline
operator /(const short_vec<double,8> & other) const184     short_vec<double, 8> operator/(const short_vec<double, 8>& other) const
185     {
186         return short_vec<double, 8>(
187             _mm_div_pd(val1, other.val1),
188             _mm_div_pd(val2, other.val2),
189             _mm_div_pd(val3, other.val3),
190             _mm_div_pd(val4, other.val4));
191     }
192 
193     inline
194     short_vec<double, 8> operator/(const sqrt_reference<double, 8>& other) const;
195 
196     inline
operator <(const short_vec<double,8> & other) const197     short_vec<double, 8> operator<(const short_vec<double, 8>& other) const
198     {
199         return short_vec<double, 8>(
200             _mm_cmplt_pd(val1, other.val1),
201             _mm_cmplt_pd(val2, other.val2),
202             _mm_cmplt_pd(val3, other.val3),
203             _mm_cmplt_pd(val4, other.val4));
204     }
205 
206     inline
operator <=(const short_vec<double,8> & other) const207     short_vec<double, 8> operator<=(const short_vec<double, 8>& other) const
208     {
209         return short_vec<double, 8>(
210             _mm_cmple_pd(val1, other.val1),
211             _mm_cmple_pd(val2, other.val2),
212             _mm_cmple_pd(val3, other.val3),
213             _mm_cmple_pd(val4, other.val4));
214     }
215 
216     inline
operator ==(const short_vec<double,8> & other) const217     short_vec<double, 8> operator==(const short_vec<double, 8>& other) const
218     {
219         return short_vec<double, 8>(
220             _mm_cmpeq_pd(val1, other.val1),
221             _mm_cmpeq_pd(val2, other.val2),
222             _mm_cmpeq_pd(val3, other.val3),
223             _mm_cmpeq_pd(val4, other.val4));
224     }
225 
226     inline
operator >(const short_vec<double,8> & other) const227     short_vec<double, 8> operator>(const short_vec<double, 8>& other) const
228     {
229         return short_vec<double, 8>(
230             _mm_cmpgt_pd(val1, other.val1),
231             _mm_cmpgt_pd(val2, other.val2),
232             _mm_cmpgt_pd(val3, other.val3),
233             _mm_cmpgt_pd(val4, other.val4));
234     }
235 
236     inline
operator >=(const short_vec<double,8> & other) const237     short_vec<double, 8> operator>=(const short_vec<double, 8>& other) const
238     {
239         return short_vec<double, 8>(
240             _mm_cmpge_pd(val1, other.val1),
241             _mm_cmpge_pd(val2, other.val2),
242             _mm_cmpge_pd(val3, other.val3),
243             _mm_cmpge_pd(val4, other.val4));
244     }
245 
246     inline
sqrt() const247     short_vec<double, 8> sqrt() const
248     {
249         return short_vec<double, 8>(
250             _mm_sqrt_pd(val1),
251             _mm_sqrt_pd(val2),
252             _mm_sqrt_pd(val3),
253             _mm_sqrt_pd(val4));
254     }
255 
256     inline
load(const double * data)257     void load(const double *data)
258     {
259         val1 = _mm_loadu_pd(data + 0);
260         val2 = _mm_loadu_pd(data + 2);
261         val3 = _mm_loadu_pd(data + 4);
262         val4 = _mm_loadu_pd(data + 6);
263     }
264 
265     inline
load_aligned(const double * data)266     void load_aligned(const double *data)
267     {
268         SHORTVEC_ASSERT_ALIGNED(data, 16);
269         val1 = _mm_load_pd(data + 0);
270         val2 = _mm_load_pd(data + 2);
271         val3 = _mm_load_pd(data + 4);
272         val4 = _mm_load_pd(data + 6);
273     }
274 
275     inline
store(double * data) const276     void store(double *data) const
277     {
278         _mm_storeu_pd(data + 0, val1);
279         _mm_storeu_pd(data + 2, val2);
280         _mm_storeu_pd(data + 4, val3);
281         _mm_storeu_pd(data + 6, val4);
282     }
283 
284     inline
store_aligned(double * data) const285     void store_aligned(double *data) const
286     {
287         SHORTVEC_ASSERT_ALIGNED(data, 16);
288         _mm_store_pd(data + 0, val1);
289         _mm_store_pd(data + 2, val2);
290         _mm_store_pd(data + 4, val3);
291         _mm_store_pd(data + 6, val4);
292     }
293 
294     inline
store_nt(double * data) const295     void store_nt(double *data) const
296     {
297         SHORTVEC_ASSERT_ALIGNED(data, 16);
298         _mm_stream_pd(data + 0, val1);
299         _mm_stream_pd(data + 2, val2);
300         _mm_stream_pd(data + 4, val3);
301         _mm_stream_pd(data + 6, val4);
302     }
303 
304     inline
gather(const double * ptr,const int * offsets)305     void gather(const double *ptr, const int *offsets)
306     {
307         val1 = _mm_loadl_pd(val1, ptr + offsets[0]);
308         val1 = _mm_loadh_pd(val1, ptr + offsets[1]);
309         val2 = _mm_loadl_pd(val2, ptr + offsets[2]);
310         val2 = _mm_loadh_pd(val2, ptr + offsets[3]);
311         val3 = _mm_loadl_pd(val3, ptr + offsets[4]);
312         val3 = _mm_loadh_pd(val3, ptr + offsets[5]);
313         val4 = _mm_loadl_pd(val4, ptr + offsets[6]);
314         val4 = _mm_loadh_pd(val4, ptr + offsets[7]);
315     }
316 
317     inline
scatter(double * ptr,const int * offsets) const318     void scatter(double *ptr, const int *offsets) const
319     {
320         _mm_storel_pd(ptr + offsets[0], val1);
321         _mm_storeh_pd(ptr + offsets[1], val1);
322         _mm_storel_pd(ptr + offsets[2], val2);
323         _mm_storeh_pd(ptr + offsets[3], val2);
324         _mm_storel_pd(ptr + offsets[4], val3);
325         _mm_storeh_pd(ptr + offsets[5], val3);
326         _mm_storel_pd(ptr + offsets[6], val4);
327         _mm_storeh_pd(ptr + offsets[7], val4);
328     }
329 
330 private:
331     __m128d val1;
332     __m128d val2;
333     __m128d val3;
334     __m128d val4;
335 };
336 
337 #ifdef __ICC
338 #pragma warning pop
339 #endif
340 
341 inline
operator <<(double * data,const short_vec<double,8> & vec)342 void operator<<(double *data, const short_vec<double, 8>& vec)
343 {
344     vec.store(data);
345 }
346 
347 inline
sqrt(const short_vec<double,8> & vec)348 short_vec<double, 8> sqrt(const short_vec<double, 8>& vec)
349 {
350     return vec.sqrt();
351 }
352 
353 template<typename _CharT, typename _Traits>
354 std::basic_ostream<_CharT, _Traits>&
operator <<(std::basic_ostream<_CharT,_Traits> & __os,const short_vec<double,8> & vec)355 operator<<(std::basic_ostream<_CharT, _Traits>& __os,
356            const short_vec<double, 8>& vec)
357 {
358     const double *data1 = reinterpret_cast<const double *>(&vec.val1);
359     const double *data2 = reinterpret_cast<const double *>(&vec.val2);
360     const double *data3 = reinterpret_cast<const double *>(&vec.val3);
361     const double *data4 = reinterpret_cast<const double *>(&vec.val4);
362     __os << "[" << data1[0] << ", " << data1[1]  << ", " << data2[0]  << ", " << data2[1]  << ", " << data3[0]  << ", " << data3[1]  << ", " << data4[0]  << ", " << data4[1] << "]";
363     return __os;
364 }
365 
366 }
367 
368 #endif
369 
370 #endif
371