1 /**
2 * Copyright 2014-2016 Andreas Schäfer
3 * Copyright 2015 Kurt Kanzenbach
4 *
5 * Distributed under the Boost Software License, Version 1.0. (See accompanying
6 * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
7 */
8
9 #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_DOUBLE_8_HPP
10 #define FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_DOUBLE_8_HPP
11
12 #if (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE) || \
13 (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE2) || \
14 (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE4_1)
15
16 #include <emmintrin.h>
17 #include <libflatarray/detail/short_vec_helpers.hpp>
18 #include <libflatarray/config.h>
19
20 #ifdef LIBFLATARRAY_WITH_CPP14
21 #include <initializer_list>
22 #endif
23
24 namespace LibFlatArray {
25
26 template<typename CARGO, int ARITY>
27 class short_vec;
28
29 #ifdef __ICC
30 // disabling this warning as implicit type conversion is exactly our goal here:
31 #pragma warning push
32 #pragma warning (disable: 2304)
33 #endif
34
35 template<>
36 class short_vec<double, 8>
37 {
38 public:
39 static const int ARITY = 8;
40 typedef short_vec<double, 8> mask_type;
41 typedef short_vec_strategy::sse strategy;
42
43 template<typename _CharT, typename _Traits>
44 friend std::basic_ostream<_CharT, _Traits>& operator<<(
45 std::basic_ostream<_CharT, _Traits>& __os,
46 const short_vec<double, 8>& vec);
47
48 inline
short_vec(const double data=0)49 short_vec(const double data = 0) :
50 val1(_mm_set1_pd(data)),
51 val2(_mm_set1_pd(data)),
52 val3(_mm_set1_pd(data)),
53 val4(_mm_set1_pd(data))
54 {}
55
56 inline
short_vec(const double * data)57 short_vec(const double *data)
58 {
59 load(data);
60 }
61
62 inline
short_vec(const __m128d & val1,const __m128d & val2,const __m128d & val3,const __m128d & val4)63 short_vec(const __m128d& val1, const __m128d& val2, const __m128d& val3, const __m128d& val4) :
64 val1(val1),
65 val2(val2),
66 val3(val3),
67 val4(val4)
68 {}
69
70 #ifdef LIBFLATARRAY_WITH_CPP14
71 inline
short_vec(const std::initializer_list<double> & il)72 short_vec(const std::initializer_list<double>& il)
73 {
74 const double *ptr = static_cast<const double *>(&(*il.begin()));
75 load(ptr);
76 }
77 #endif
78
79 inline
any() const80 bool any() const
81 {
82 __m128d buf1 = _mm_or_pd(_mm_or_pd(val1, val2),
83 _mm_or_pd(val3, val4));
84 __m128d buf2 = _mm_shuffle_pd(buf1, buf1, 1);
85
86 return _mm_cvtsd_f64(buf1) || _mm_cvtsd_f64(buf2);
87 }
88
89 inline
get(int i) const90 double get(int i) const
91 {
92 __m128d buf;
93 if (i < 4) {
94 if (i < 2) {
95 buf = val1;
96 } else {
97 buf = val2;
98 }
99 } else {
100 if (i < 6) {
101 buf = val3;
102 } else {
103 buf = val4;
104 }
105 }
106
107 i &= 1;
108
109 if (i == 0) {
110 return _mm_cvtsd_f64(buf);
111 }
112
113 buf = _mm_shuffle_pd(buf, buf, 1);
114 return _mm_cvtsd_f64(buf);
115 }
116
117 inline
operator -=(const short_vec<double,8> & other)118 void operator-=(const short_vec<double, 8>& other)
119 {
120 val1 = _mm_sub_pd(val1, other.val1);
121 val2 = _mm_sub_pd(val2, other.val2);
122 val3 = _mm_sub_pd(val3, other.val3);
123 val4 = _mm_sub_pd(val4, other.val4);
124 }
125
126 inline
operator -(const short_vec<double,8> & other) const127 short_vec<double, 8> operator-(const short_vec<double, 8>& other) const
128 {
129 return short_vec<double, 8>(
130 _mm_sub_pd(val1, other.val1),
131 _mm_sub_pd(val2, other.val2),
132 _mm_sub_pd(val3, other.val3),
133 _mm_sub_pd(val4, other.val4));
134 }
135
136 inline
operator +=(const short_vec<double,8> & other)137 void operator+=(const short_vec<double, 8>& other)
138 {
139 val1 = _mm_add_pd(val1, other.val1);
140 val2 = _mm_add_pd(val2, other.val2);
141 val3 = _mm_add_pd(val3, other.val3);
142 val4 = _mm_add_pd(val4, other.val4);
143 }
144
145 inline
operator +(const short_vec<double,8> & other) const146 short_vec<double, 8> operator+(const short_vec<double, 8>& other) const
147 {
148 return short_vec<double, 8>(
149 _mm_add_pd(val1, other.val1),
150 _mm_add_pd(val2, other.val2),
151 _mm_add_pd(val3, other.val3),
152 _mm_add_pd(val4, other.val4));
153 }
154
155 inline
operator *=(const short_vec<double,8> & other)156 void operator*=(const short_vec<double, 8>& other)
157 {
158 val1 = _mm_mul_pd(val1, other.val1);
159 val2 = _mm_mul_pd(val2, other.val2);
160 val3 = _mm_mul_pd(val3, other.val3);
161 val4 = _mm_mul_pd(val4, other.val4);
162 }
163
164 inline
operator *(const short_vec<double,8> & other) const165 short_vec<double, 8> operator*(const short_vec<double, 8>& other) const
166 {
167 return short_vec<double, 8>(
168 _mm_mul_pd(val1, other.val1),
169 _mm_mul_pd(val2, other.val2),
170 _mm_mul_pd(val3, other.val3),
171 _mm_mul_pd(val4, other.val4));
172 }
173
174 inline
operator /=(const short_vec<double,8> & other)175 void operator/=(const short_vec<double, 8>& other)
176 {
177 val1 = _mm_div_pd(val1, other.val1);
178 val2 = _mm_div_pd(val2, other.val2);
179 val3 = _mm_div_pd(val3, other.val3);
180 val4 = _mm_div_pd(val4, other.val4);
181 }
182
183 inline
operator /(const short_vec<double,8> & other) const184 short_vec<double, 8> operator/(const short_vec<double, 8>& other) const
185 {
186 return short_vec<double, 8>(
187 _mm_div_pd(val1, other.val1),
188 _mm_div_pd(val2, other.val2),
189 _mm_div_pd(val3, other.val3),
190 _mm_div_pd(val4, other.val4));
191 }
192
193 inline
194 short_vec<double, 8> operator/(const sqrt_reference<double, 8>& other) const;
195
196 inline
operator <(const short_vec<double,8> & other) const197 short_vec<double, 8> operator<(const short_vec<double, 8>& other) const
198 {
199 return short_vec<double, 8>(
200 _mm_cmplt_pd(val1, other.val1),
201 _mm_cmplt_pd(val2, other.val2),
202 _mm_cmplt_pd(val3, other.val3),
203 _mm_cmplt_pd(val4, other.val4));
204 }
205
206 inline
operator <=(const short_vec<double,8> & other) const207 short_vec<double, 8> operator<=(const short_vec<double, 8>& other) const
208 {
209 return short_vec<double, 8>(
210 _mm_cmple_pd(val1, other.val1),
211 _mm_cmple_pd(val2, other.val2),
212 _mm_cmple_pd(val3, other.val3),
213 _mm_cmple_pd(val4, other.val4));
214 }
215
216 inline
operator ==(const short_vec<double,8> & other) const217 short_vec<double, 8> operator==(const short_vec<double, 8>& other) const
218 {
219 return short_vec<double, 8>(
220 _mm_cmpeq_pd(val1, other.val1),
221 _mm_cmpeq_pd(val2, other.val2),
222 _mm_cmpeq_pd(val3, other.val3),
223 _mm_cmpeq_pd(val4, other.val4));
224 }
225
226 inline
operator >(const short_vec<double,8> & other) const227 short_vec<double, 8> operator>(const short_vec<double, 8>& other) const
228 {
229 return short_vec<double, 8>(
230 _mm_cmpgt_pd(val1, other.val1),
231 _mm_cmpgt_pd(val2, other.val2),
232 _mm_cmpgt_pd(val3, other.val3),
233 _mm_cmpgt_pd(val4, other.val4));
234 }
235
236 inline
operator >=(const short_vec<double,8> & other) const237 short_vec<double, 8> operator>=(const short_vec<double, 8>& other) const
238 {
239 return short_vec<double, 8>(
240 _mm_cmpge_pd(val1, other.val1),
241 _mm_cmpge_pd(val2, other.val2),
242 _mm_cmpge_pd(val3, other.val3),
243 _mm_cmpge_pd(val4, other.val4));
244 }
245
246 inline
sqrt() const247 short_vec<double, 8> sqrt() const
248 {
249 return short_vec<double, 8>(
250 _mm_sqrt_pd(val1),
251 _mm_sqrt_pd(val2),
252 _mm_sqrt_pd(val3),
253 _mm_sqrt_pd(val4));
254 }
255
256 inline
load(const double * data)257 void load(const double *data)
258 {
259 val1 = _mm_loadu_pd(data + 0);
260 val2 = _mm_loadu_pd(data + 2);
261 val3 = _mm_loadu_pd(data + 4);
262 val4 = _mm_loadu_pd(data + 6);
263 }
264
265 inline
load_aligned(const double * data)266 void load_aligned(const double *data)
267 {
268 SHORTVEC_ASSERT_ALIGNED(data, 16);
269 val1 = _mm_load_pd(data + 0);
270 val2 = _mm_load_pd(data + 2);
271 val3 = _mm_load_pd(data + 4);
272 val4 = _mm_load_pd(data + 6);
273 }
274
275 inline
store(double * data) const276 void store(double *data) const
277 {
278 _mm_storeu_pd(data + 0, val1);
279 _mm_storeu_pd(data + 2, val2);
280 _mm_storeu_pd(data + 4, val3);
281 _mm_storeu_pd(data + 6, val4);
282 }
283
284 inline
store_aligned(double * data) const285 void store_aligned(double *data) const
286 {
287 SHORTVEC_ASSERT_ALIGNED(data, 16);
288 _mm_store_pd(data + 0, val1);
289 _mm_store_pd(data + 2, val2);
290 _mm_store_pd(data + 4, val3);
291 _mm_store_pd(data + 6, val4);
292 }
293
294 inline
store_nt(double * data) const295 void store_nt(double *data) const
296 {
297 SHORTVEC_ASSERT_ALIGNED(data, 16);
298 _mm_stream_pd(data + 0, val1);
299 _mm_stream_pd(data + 2, val2);
300 _mm_stream_pd(data + 4, val3);
301 _mm_stream_pd(data + 6, val4);
302 }
303
304 inline
gather(const double * ptr,const int * offsets)305 void gather(const double *ptr, const int *offsets)
306 {
307 val1 = _mm_loadl_pd(val1, ptr + offsets[0]);
308 val1 = _mm_loadh_pd(val1, ptr + offsets[1]);
309 val2 = _mm_loadl_pd(val2, ptr + offsets[2]);
310 val2 = _mm_loadh_pd(val2, ptr + offsets[3]);
311 val3 = _mm_loadl_pd(val3, ptr + offsets[4]);
312 val3 = _mm_loadh_pd(val3, ptr + offsets[5]);
313 val4 = _mm_loadl_pd(val4, ptr + offsets[6]);
314 val4 = _mm_loadh_pd(val4, ptr + offsets[7]);
315 }
316
317 inline
scatter(double * ptr,const int * offsets) const318 void scatter(double *ptr, const int *offsets) const
319 {
320 _mm_storel_pd(ptr + offsets[0], val1);
321 _mm_storeh_pd(ptr + offsets[1], val1);
322 _mm_storel_pd(ptr + offsets[2], val2);
323 _mm_storeh_pd(ptr + offsets[3], val2);
324 _mm_storel_pd(ptr + offsets[4], val3);
325 _mm_storeh_pd(ptr + offsets[5], val3);
326 _mm_storel_pd(ptr + offsets[6], val4);
327 _mm_storeh_pd(ptr + offsets[7], val4);
328 }
329
330 private:
331 __m128d val1;
332 __m128d val2;
333 __m128d val3;
334 __m128d val4;
335 };
336
337 #ifdef __ICC
338 #pragma warning pop
339 #endif
340
341 inline
operator <<(double * data,const short_vec<double,8> & vec)342 void operator<<(double *data, const short_vec<double, 8>& vec)
343 {
344 vec.store(data);
345 }
346
347 inline
sqrt(const short_vec<double,8> & vec)348 short_vec<double, 8> sqrt(const short_vec<double, 8>& vec)
349 {
350 return vec.sqrt();
351 }
352
353 template<typename _CharT, typename _Traits>
354 std::basic_ostream<_CharT, _Traits>&
operator <<(std::basic_ostream<_CharT,_Traits> & __os,const short_vec<double,8> & vec)355 operator<<(std::basic_ostream<_CharT, _Traits>& __os,
356 const short_vec<double, 8>& vec)
357 {
358 const double *data1 = reinterpret_cast<const double *>(&vec.val1);
359 const double *data2 = reinterpret_cast<const double *>(&vec.val2);
360 const double *data3 = reinterpret_cast<const double *>(&vec.val3);
361 const double *data4 = reinterpret_cast<const double *>(&vec.val4);
362 __os << "[" << data1[0] << ", " << data1[1] << ", " << data2[0] << ", " << data2[1] << ", " << data3[0] << ", " << data3[1] << ", " << data4[0] << ", " << data4[1] << "]";
363 return __os;
364 }
365
366 }
367
368 #endif
369
370 #endif
371