/**
 * Copyright 2014-2016 Andreas Schäfer
 * Copyright 2015 Kurt Kanzenbach
 *
 * Distributed under the Boost Software License, Version 1.0. (See accompanying
 * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
 */

#ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_FLOAT_32_HPP
#define FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_FLOAT_32_HPP

#if (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE) ||            \
    (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE2) ||           \
    (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE4_1)

#include <emmintrin.h>
#include <libflatarray/detail/sqrt_reference.hpp>
#include <libflatarray/detail/short_vec_helpers.hpp>
#include <libflatarray/config.h>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#ifdef LIBFLATARRAY_WITH_CPP14
#include <initializer_list>
#endif

namespace LibFlatArray {

template<typename CARGO, int ARITY>
class short_vec;

template<typename CARGO, int ARITY>
class sqrt_reference;

#ifdef __ICC
// disabling this warning as implicit type conversion is exactly our goal here:
#pragma warning push
#pragma warning (disable: 2304)
#endif

43 template<>
44 class short_vec<float, 32>
45 {
46 public:
47 static const int ARITY = 32;
48 typedef short_vec<float, 32> mask_type;
49 typedef short_vec_strategy::sse strategy;
50
51 template<typename _CharT, typename _Traits>
52 friend std::basic_ostream<_CharT, _Traits>& operator<<(
53 std::basic_ostream<_CharT, _Traits>& __os,
54 const short_vec<float, 32>& vec);
55
56 inline
short_vec(const float data=0)57 short_vec(const float data = 0) :
58 val1(_mm_set1_ps(data)),
59 val2(_mm_set1_ps(data)),
60 val3(_mm_set1_ps(data)),
61 val4(_mm_set1_ps(data)),
62 val5(_mm_set1_ps(data)),
63 val6(_mm_set1_ps(data)),
64 val7(_mm_set1_ps(data)),
65 val8(_mm_set1_ps(data))
66 {}
67
68 inline
short_vec(const float * data)69 short_vec(const float *data)
70 {
71 load(data);
72 }
73
74 inline
short_vec(const __m128 & val1,const __m128 & val2,const __m128 & val3,const __m128 & val4,const __m128 & val5,const __m128 & val6,const __m128 & val7,const __m128 & val8)75 short_vec(
76 const __m128& val1,
77 const __m128& val2,
78 const __m128& val3,
79 const __m128& val4,
80 const __m128& val5,
81 const __m128& val6,
82 const __m128& val7,
83 const __m128& val8) :
84 val1(val1),
85 val2(val2),
86 val3(val3),
87 val4(val4),
88 val5(val5),
89 val6(val6),
90 val7(val7),
91 val8(val8)
92 {}
93
94 #ifdef LIBFLATARRAY_WITH_CPP14
95 inline
short_vec(const std::initializer_list<float> & il)96 short_vec(const std::initializer_list<float>& il)
97 {
98 const float *ptr = static_cast<const float *>(&(*il.begin()));
99 load(ptr);
100 }
101 #endif
102
103 inline
104 short_vec(const sqrt_reference<float, 32>& other);
105
106 inline
any() const107 bool any() const
108 {
109 __m128 buf1 = _mm_or_ps(
110 _mm_or_ps(_mm_or_ps(val1, val2),
111 _mm_or_ps(val3, val4)),
112 _mm_or_ps(_mm_or_ps(val5, val6),
113 _mm_or_ps(val7, val8)));
114 __m128 buf2 = _mm_shuffle_ps(buf1, buf1, (3 << 2) | (2 << 0));
115 buf1 = _mm_or_ps(buf1, buf2);
116 buf2 = _mm_shuffle_ps(buf1, buf1, (1 << 0));
117 return _mm_cvtss_f32(buf1) || _mm_cvtss_f32(buf2);
118 }
119
120 inline
get(int i) const121 float get(int i) const
122 {
123 __m128 buf;
124 if (i < 16) {
125 if (i < 8) {
126 if (i < 4) {
127 buf = val1;
128 } else {
129 buf = val2;
130 }
131 } else {
132 if (i < 12) {
133 buf = val3;
134 } else {
135 buf = val4;
136 }
137 }
138 } else {
139 if (i < 24) {
140 if (i < 20) {
141 buf = val5;
142 } else {
143 buf = val6;
144 }
145 } else {
146 if (i < 28) {
147 buf = val7;
148 } else {
149 buf = val8;
150 }
151 }
152 }
153
154 i &= 3;
155
156 if (i == 3) {
157 return _mm_cvtss_f32(_mm_shuffle_ps(buf, buf, 3));
158 }
159 if (i == 2) {
160 return _mm_cvtss_f32(_mm_shuffle_ps(buf, buf, 2));
161 }
162 if (i == 1) {
163 return _mm_cvtss_f32(_mm_shuffle_ps(buf, buf, 1));
164 }
165
166 return _mm_cvtss_f32(buf);
167 }
168
169 inline
operator -=(const short_vec<float,32> & other)170 void operator-=(const short_vec<float, 32>& other)
171 {
172 val1 = _mm_sub_ps(val1, other.val1);
173 val2 = _mm_sub_ps(val2, other.val2);
174 val3 = _mm_sub_ps(val3, other.val3);
175 val4 = _mm_sub_ps(val4, other.val4);
176 val5 = _mm_sub_ps(val5, other.val5);
177 val6 = _mm_sub_ps(val6, other.val6);
178 val7 = _mm_sub_ps(val7, other.val7);
179 val8 = _mm_sub_ps(val8, other.val8);
180 }
181
182 inline
operator -(const short_vec<float,32> & other) const183 short_vec<float, 32> operator-(const short_vec<float, 32>& other) const
184 {
185 return short_vec<float, 32>(
186 _mm_sub_ps(val1, other.val1),
187 _mm_sub_ps(val2, other.val2),
188 _mm_sub_ps(val3, other.val3),
189 _mm_sub_ps(val4, other.val4),
190 _mm_sub_ps(val5, other.val5),
191 _mm_sub_ps(val6, other.val6),
192 _mm_sub_ps(val7, other.val7),
193 _mm_sub_ps(val8, other.val8));
194 }
195
196 inline
operator +=(const short_vec<float,32> & other)197 void operator+=(const short_vec<float, 32>& other)
198 {
199 val1 = _mm_add_ps(val1, other.val1);
200 val2 = _mm_add_ps(val2, other.val2);
201 val3 = _mm_add_ps(val3, other.val3);
202 val4 = _mm_add_ps(val4, other.val4);
203 val5 = _mm_add_ps(val5, other.val5);
204 val6 = _mm_add_ps(val6, other.val6);
205 val7 = _mm_add_ps(val7, other.val7);
206 val8 = _mm_add_ps(val8, other.val8);
207 }
208
209 inline
operator +(const short_vec<float,32> & other) const210 short_vec<float, 32> operator+(const short_vec<float, 32>& other) const
211 {
212 return short_vec<float, 32>(
213 _mm_add_ps(val1, other.val1),
214 _mm_add_ps(val2, other.val2),
215 _mm_add_ps(val3, other.val3),
216 _mm_add_ps(val4, other.val4),
217 _mm_add_ps(val5, other.val5),
218 _mm_add_ps(val6, other.val6),
219 _mm_add_ps(val7, other.val7),
220 _mm_add_ps(val8, other.val8));
221 }
222
223 inline
operator *=(const short_vec<float,32> & other)224 void operator*=(const short_vec<float, 32>& other)
225 {
226 val1 = _mm_mul_ps(val1, other.val1);
227 val2 = _mm_mul_ps(val2, other.val2);
228 val3 = _mm_mul_ps(val3, other.val3);
229 val4 = _mm_mul_ps(val4, other.val4);
230 val5 = _mm_mul_ps(val5, other.val5);
231 val6 = _mm_mul_ps(val6, other.val6);
232 val7 = _mm_mul_ps(val7, other.val7);
233 val8 = _mm_mul_ps(val8, other.val8);
234 }
235
236 inline
operator *(const short_vec<float,32> & other) const237 short_vec<float, 32> operator*(const short_vec<float, 32>& other) const
238 {
239 return short_vec<float, 32>(
240 _mm_mul_ps(val1, other.val1),
241 _mm_mul_ps(val2, other.val2),
242 _mm_mul_ps(val3, other.val3),
243 _mm_mul_ps(val4, other.val4),
244 _mm_mul_ps(val5, other.val5),
245 _mm_mul_ps(val6, other.val6),
246 _mm_mul_ps(val7, other.val7),
247 _mm_mul_ps(val8, other.val8));
248 }
249
250 inline
operator /=(const short_vec<float,32> & other)251 void operator/=(const short_vec<float, 32>& other)
252 {
253 val1 = _mm_div_ps(val1, other.val1);
254 val2 = _mm_div_ps(val2, other.val2);
255 val3 = _mm_div_ps(val3, other.val3);
256 val4 = _mm_div_ps(val4, other.val4);
257 val5 = _mm_div_ps(val5, other.val5);
258 val6 = _mm_div_ps(val6, other.val6);
259 val7 = _mm_div_ps(val7, other.val7);
260 val8 = _mm_div_ps(val8, other.val8);
261 }
262
263 inline
264 void operator/=(const sqrt_reference<float, 32>& other);
265
266 inline
operator /(const short_vec<float,32> & other) const267 short_vec<float, 32> operator/(const short_vec<float, 32>& other) const
268 {
269 return short_vec<float, 32>(
270 _mm_div_ps(val1, other.val1),
271 _mm_div_ps(val2, other.val2),
272 _mm_div_ps(val3, other.val3),
273 _mm_div_ps(val4, other.val4),
274 _mm_div_ps(val5, other.val5),
275 _mm_div_ps(val6, other.val6),
276 _mm_div_ps(val7, other.val7),
277 _mm_div_ps(val8, other.val8));
278 }
279
280 inline
281 short_vec<float, 32> operator/(const sqrt_reference<float, 32>& other) const;
282
283 inline
operator <(const short_vec<float,32> & other) const284 short_vec<float, 32> operator<(const short_vec<float, 32>& other) const
285 {
286 return short_vec<float, 32>(
287 _mm_cmplt_ps(val1, other.val1),
288 _mm_cmplt_ps(val2, other.val2),
289 _mm_cmplt_ps(val3, other.val3),
290 _mm_cmplt_ps(val4, other.val4),
291 _mm_cmplt_ps(val5, other.val5),
292 _mm_cmplt_ps(val6, other.val6),
293 _mm_cmplt_ps(val7, other.val7),
294 _mm_cmplt_ps(val8, other.val8));
295 }
296
297 inline
operator <=(const short_vec<float,32> & other) const298 short_vec<float, 32> operator<=(const short_vec<float, 32>& other) const
299 {
300 return short_vec<float, 32>(
301 _mm_cmple_ps(val1, other.val1),
302 _mm_cmple_ps(val2, other.val2),
303 _mm_cmple_ps(val3, other.val3),
304 _mm_cmple_ps(val4, other.val4),
305 _mm_cmple_ps(val5, other.val5),
306 _mm_cmple_ps(val6, other.val6),
307 _mm_cmple_ps(val7, other.val7),
308 _mm_cmple_ps(val8, other.val8));
309 }
310
311 inline
operator ==(const short_vec<float,32> & other) const312 short_vec<float, 32> operator==(const short_vec<float, 32>& other) const
313 {
314 return short_vec<float, 32>(
315 _mm_cmpeq_ps(val1, other.val1),
316 _mm_cmpeq_ps(val2, other.val2),
317 _mm_cmpeq_ps(val3, other.val3),
318 _mm_cmpeq_ps(val4, other.val4),
319 _mm_cmpeq_ps(val5, other.val5),
320 _mm_cmpeq_ps(val6, other.val6),
321 _mm_cmpeq_ps(val7, other.val7),
322 _mm_cmpeq_ps(val8, other.val8));
323 }
324
325 inline
operator >(const short_vec<float,32> & other) const326 short_vec<float, 32> operator>(const short_vec<float, 32>& other) const
327 {
328 return short_vec<float, 32>(
329 _mm_cmpgt_ps(val1, other.val1),
330 _mm_cmpgt_ps(val2, other.val2),
331 _mm_cmpgt_ps(val3, other.val3),
332 _mm_cmpgt_ps(val4, other.val4),
333 _mm_cmpgt_ps(val5, other.val5),
334 _mm_cmpgt_ps(val6, other.val6),
335 _mm_cmpgt_ps(val7, other.val7),
336 _mm_cmpgt_ps(val8, other.val8));
337 }
338
339 inline
operator >=(const short_vec<float,32> & other) const340 short_vec<float, 32> operator>=(const short_vec<float, 32>& other) const
341 {
342 return short_vec<float, 32>(
343 _mm_cmpge_ps(val1, other.val1),
344 _mm_cmpge_ps(val2, other.val2),
345 _mm_cmpge_ps(val3, other.val3),
346 _mm_cmpge_ps(val4, other.val4),
347 _mm_cmpge_ps(val5, other.val5),
348 _mm_cmpge_ps(val6, other.val6),
349 _mm_cmpge_ps(val7, other.val7),
350 _mm_cmpge_ps(val8, other.val8));
351 }
352
353 inline
sqrt() const354 short_vec<float, 32> sqrt() const
355 {
356 return short_vec<float, 32>(
357 _mm_sqrt_ps(val1),
358 _mm_sqrt_ps(val2),
359 _mm_sqrt_ps(val3),
360 _mm_sqrt_ps(val4),
361 _mm_sqrt_ps(val5),
362 _mm_sqrt_ps(val6),
363 _mm_sqrt_ps(val7),
364 _mm_sqrt_ps(val8));
365 }
366
367 inline
load(const float * data)368 void load(const float *data)
369 {
370 val1 = _mm_loadu_ps(data + 0);
371 val2 = _mm_loadu_ps(data + 4);
372 val3 = _mm_loadu_ps(data + 8);
373 val4 = _mm_loadu_ps(data + 12);
374 val5 = _mm_loadu_ps(data + 16);
375 val6 = _mm_loadu_ps(data + 20);
376 val7 = _mm_loadu_ps(data + 24);
377 val8 = _mm_loadu_ps(data + 28);
378 }
379
380 inline
load_aligned(const float * data)381 void load_aligned(const float *data)
382 {
383 SHORTVEC_ASSERT_ALIGNED(data, 16);
384 val1 = _mm_load_ps(data + 0);
385 val2 = _mm_load_ps(data + 4);
386 val3 = _mm_load_ps(data + 8);
387 val4 = _mm_load_ps(data + 12);
388 val5 = _mm_load_ps(data + 16);
389 val6 = _mm_load_ps(data + 20);
390 val7 = _mm_load_ps(data + 24);
391 val8 = _mm_load_ps(data + 28);
392 }
393
394 inline
store(float * data) const395 void store(float *data) const
396 {
397 _mm_storeu_ps(data + 0, val1);
398 _mm_storeu_ps(data + 4, val2);
399 _mm_storeu_ps(data + 8, val3);
400 _mm_storeu_ps(data + 12, val4);
401 _mm_storeu_ps(data + 16, val5);
402 _mm_storeu_ps(data + 20, val6);
403 _mm_storeu_ps(data + 24, val7);
404 _mm_storeu_ps(data + 28, val8);
405 }
406
407 inline
store_aligned(float * data) const408 void store_aligned(float *data) const
409 {
410 SHORTVEC_ASSERT_ALIGNED(data, 16);
411 _mm_store_ps(data + 0, val1);
412 _mm_store_ps(data + 4, val2);
413 _mm_store_ps(data + 8, val3);
414 _mm_store_ps(data + 12, val4);
415 _mm_store_ps(data + 16, val5);
416 _mm_store_ps(data + 20, val6);
417 _mm_store_ps(data + 24, val7);
418 _mm_store_ps(data + 28, val8);
419 }
420
421 inline
store_nt(float * data) const422 void store_nt(float *data) const
423 {
424 SHORTVEC_ASSERT_ALIGNED(data, 16);
425 _mm_stream_ps(data + 0, val1);
426 _mm_stream_ps(data + 4, val2);
427 _mm_stream_ps(data + 8, val3);
428 _mm_stream_ps(data + 12, val4);
429 _mm_stream_ps(data + 16, val5);
430 _mm_stream_ps(data + 20, val6);
431 _mm_stream_ps(data + 24, val7);
432 _mm_stream_ps(data + 28, val8);
433 }
434
435 #ifdef __SSE4_1__
436 inline
gather(const float * ptr,const int * offsets)437 void gather(const float *ptr, const int *offsets)
438 {
439 val1 = _mm_load_ss(ptr + offsets[0]);
440 SHORTVEC_INSERT_PS(val1, ptr, offsets[ 1], _MM_MK_INSERTPS_NDX(0,1,0));
441 SHORTVEC_INSERT_PS(val1, ptr, offsets[ 2], _MM_MK_INSERTPS_NDX(0,2,0));
442 SHORTVEC_INSERT_PS(val1, ptr, offsets[ 3], _MM_MK_INSERTPS_NDX(0,3,0));
443
444 val2 = _mm_load_ss(ptr + offsets[4]);
445 SHORTVEC_INSERT_PS(val2, ptr, offsets[ 5], _MM_MK_INSERTPS_NDX(0,1,0));
446 SHORTVEC_INSERT_PS(val2, ptr, offsets[ 6], _MM_MK_INSERTPS_NDX(0,2,0));
447 SHORTVEC_INSERT_PS(val2, ptr, offsets[ 7], _MM_MK_INSERTPS_NDX(0,3,0));
448
449 val3 = _mm_load_ss(ptr + offsets[8]);
450 SHORTVEC_INSERT_PS(val3, ptr, offsets[ 9], _MM_MK_INSERTPS_NDX(0,1,0));
451 SHORTVEC_INSERT_PS(val3, ptr, offsets[10], _MM_MK_INSERTPS_NDX(0,2,0));
452 SHORTVEC_INSERT_PS(val3, ptr, offsets[11], _MM_MK_INSERTPS_NDX(0,3,0));
453
454 val4 = _mm_load_ss(ptr + offsets[12]);
455 SHORTVEC_INSERT_PS(val4, ptr, offsets[13], _MM_MK_INSERTPS_NDX(0,1,0));
456 SHORTVEC_INSERT_PS(val4, ptr, offsets[14], _MM_MK_INSERTPS_NDX(0,2,0));
457 SHORTVEC_INSERT_PS(val4, ptr, offsets[15], _MM_MK_INSERTPS_NDX(0,3,0));
458
459 val5 = _mm_load_ss(ptr + offsets[16]);
460 SHORTVEC_INSERT_PS(val5, ptr, offsets[17], _MM_MK_INSERTPS_NDX(0,1,0));
461 SHORTVEC_INSERT_PS(val5, ptr, offsets[18], _MM_MK_INSERTPS_NDX(0,2,0));
462 SHORTVEC_INSERT_PS(val5, ptr, offsets[19], _MM_MK_INSERTPS_NDX(0,3,0));
463
464 val6 = _mm_load_ss(ptr + offsets[20]);
465 SHORTVEC_INSERT_PS(val6, ptr, offsets[21], _MM_MK_INSERTPS_NDX(0,1,0));
466 SHORTVEC_INSERT_PS(val6, ptr, offsets[22], _MM_MK_INSERTPS_NDX(0,2,0));
467 SHORTVEC_INSERT_PS(val6, ptr, offsets[23], _MM_MK_INSERTPS_NDX(0,3,0));
468
469 val7 = _mm_load_ss(ptr + offsets[24]);
470 SHORTVEC_INSERT_PS(val7, ptr, offsets[25], _MM_MK_INSERTPS_NDX(0,1,0));
471 SHORTVEC_INSERT_PS(val7, ptr, offsets[26], _MM_MK_INSERTPS_NDX(0,2,0));
472 SHORTVEC_INSERT_PS(val7, ptr, offsets[27], _MM_MK_INSERTPS_NDX(0,3,0));
473
474 val8 = _mm_load_ss(ptr + offsets[28]);
475 SHORTVEC_INSERT_PS(val8, ptr, offsets[29], _MM_MK_INSERTPS_NDX(0,1,0));
476 SHORTVEC_INSERT_PS(val8, ptr, offsets[30], _MM_MK_INSERTPS_NDX(0,2,0));
477 SHORTVEC_INSERT_PS(val8, ptr, offsets[31], _MM_MK_INSERTPS_NDX(0,3,0));
478 }
479
480 inline
scatter(float * ptr,const int * offsets) const481 void scatter(float *ptr, const int *offsets) const
482 {
483 ShortVecHelpers::ExtractResult r1, r2, r3, r4;
484 r1.i = _mm_extract_ps(val1, 0);
485 r2.i = _mm_extract_ps(val1, 1);
486 r3.i = _mm_extract_ps(val1, 2);
487 r4.i = _mm_extract_ps(val1, 3);
488 ptr[offsets[0]] = r1.f;
489 ptr[offsets[1]] = r2.f;
490 ptr[offsets[2]] = r3.f;
491 ptr[offsets[3]] = r4.f;
492
493 r1.i = _mm_extract_ps(val2, 0);
494 r2.i = _mm_extract_ps(val2, 1);
495 r3.i = _mm_extract_ps(val2, 2);
496 r4.i = _mm_extract_ps(val2, 3);
497 ptr[offsets[4]] = r1.f;
498 ptr[offsets[5]] = r2.f;
499 ptr[offsets[6]] = r3.f;
500 ptr[offsets[7]] = r4.f;
501
502 r1.i = _mm_extract_ps(val3, 0);
503 r2.i = _mm_extract_ps(val3, 1);
504 r3.i = _mm_extract_ps(val3, 2);
505 r4.i = _mm_extract_ps(val3, 3);
506 ptr[offsets[ 8]] = r1.f;
507 ptr[offsets[ 9]] = r2.f;
508 ptr[offsets[10]] = r3.f;
509 ptr[offsets[11]] = r4.f;
510
511 r1.i = _mm_extract_ps(val4, 0);
512 r2.i = _mm_extract_ps(val4, 1);
513 r3.i = _mm_extract_ps(val4, 2);
514 r4.i = _mm_extract_ps(val4, 3);
515 ptr[offsets[12]] = r1.f;
516 ptr[offsets[13]] = r2.f;
517 ptr[offsets[14]] = r3.f;
518 ptr[offsets[15]] = r4.f;
519
520 r1.i = _mm_extract_ps(val5, 0);
521 r2.i = _mm_extract_ps(val5, 1);
522 r3.i = _mm_extract_ps(val5, 2);
523 r4.i = _mm_extract_ps(val5, 3);
524 ptr[offsets[16]] = r1.f;
525 ptr[offsets[17]] = r2.f;
526 ptr[offsets[18]] = r3.f;
527 ptr[offsets[19]] = r4.f;
528
529 r1.i = _mm_extract_ps(val6, 0);
530 r2.i = _mm_extract_ps(val6, 1);
531 r3.i = _mm_extract_ps(val6, 2);
532 r4.i = _mm_extract_ps(val6, 3);
533 ptr[offsets[20]] = r1.f;
534 ptr[offsets[21]] = r2.f;
535 ptr[offsets[22]] = r3.f;
536 ptr[offsets[23]] = r4.f;
537
538 r1.i = _mm_extract_ps(val7, 0);
539 r2.i = _mm_extract_ps(val7, 1);
540 r3.i = _mm_extract_ps(val7, 2);
541 r4.i = _mm_extract_ps(val7, 3);
542 ptr[offsets[24]] = r1.f;
543 ptr[offsets[25]] = r2.f;
544 ptr[offsets[26]] = r3.f;
545 ptr[offsets[27]] = r4.f;
546
547 r1.i = _mm_extract_ps(val8, 0);
548 r2.i = _mm_extract_ps(val8, 1);
549 r3.i = _mm_extract_ps(val8, 2);
550 r4.i = _mm_extract_ps(val8, 3);
551 ptr[offsets[28]] = r1.f;
552 ptr[offsets[29]] = r2.f;
553 ptr[offsets[30]] = r3.f;
554 ptr[offsets[31]] = r4.f;
555 }
556 #else
557 inline
gather(const float * ptr,const int * offsets)558 void gather(const float *ptr, const int *offsets)
559 {
560 __m128 f1, f2, f3, f4;
561 f1 = _mm_load_ss(ptr + offsets[0]);
562 f2 = _mm_load_ss(ptr + offsets[2]);
563 f1 = _mm_unpacklo_ps(f1, f2);
564 f3 = _mm_load_ss(ptr + offsets[1]);
565 f4 = _mm_load_ss(ptr + offsets[3]);
566 f3 = _mm_unpacklo_ps(f3, f4);
567 val1 = _mm_unpacklo_ps(f1, f3);
568
569 f1 = _mm_load_ss(ptr + offsets[4]);
570 f2 = _mm_load_ss(ptr + offsets[6]);
571 f1 = _mm_unpacklo_ps(f1, f2);
572 f3 = _mm_load_ss(ptr + offsets[5]);
573 f4 = _mm_load_ss(ptr + offsets[7]);
574 f3 = _mm_unpacklo_ps(f3, f4);
575 val2 = _mm_unpacklo_ps(f1, f3);
576
577 f1 = _mm_load_ss(ptr + offsets[ 8]);
578 f2 = _mm_load_ss(ptr + offsets[10]);
579 f1 = _mm_unpacklo_ps(f1, f2);
580 f3 = _mm_load_ss(ptr + offsets[ 9]);
581 f4 = _mm_load_ss(ptr + offsets[11]);
582 f3 = _mm_unpacklo_ps(f3, f4);
583 val3 = _mm_unpacklo_ps(f1, f3);
584
585 f1 = _mm_load_ss(ptr + offsets[12]);
586 f2 = _mm_load_ss(ptr + offsets[14]);
587 f1 = _mm_unpacklo_ps(f1, f2);
588 f3 = _mm_load_ss(ptr + offsets[13]);
589 f4 = _mm_load_ss(ptr + offsets[15]);
590 f3 = _mm_unpacklo_ps(f3, f4);
591 val4 = _mm_unpacklo_ps(f1, f3);
592
593 f1 = _mm_load_ss(ptr + offsets[16]);
594 f2 = _mm_load_ss(ptr + offsets[18]);
595 f1 = _mm_unpacklo_ps(f1, f2);
596 f3 = _mm_load_ss(ptr + offsets[17]);
597 f4 = _mm_load_ss(ptr + offsets[19]);
598 f3 = _mm_unpacklo_ps(f3, f4);
599 val5 = _mm_unpacklo_ps(f1, f3);
600
601 f1 = _mm_load_ss(ptr + offsets[20]);
602 f2 = _mm_load_ss(ptr + offsets[22]);
603 f1 = _mm_unpacklo_ps(f1, f2);
604 f3 = _mm_load_ss(ptr + offsets[21]);
605 f4 = _mm_load_ss(ptr + offsets[23]);
606 f3 = _mm_unpacklo_ps(f3, f4);
607 val6 = _mm_unpacklo_ps(f1, f3);
608
609 f1 = _mm_load_ss(ptr + offsets[24]);
610 f2 = _mm_load_ss(ptr + offsets[26]);
611 f1 = _mm_unpacklo_ps(f1, f2);
612 f3 = _mm_load_ss(ptr + offsets[25]);
613 f4 = _mm_load_ss(ptr + offsets[27]);
614 f3 = _mm_unpacklo_ps(f3, f4);
615 val7 = _mm_unpacklo_ps(f1, f3);
616
617 f1 = _mm_load_ss(ptr + offsets[28]);
618 f2 = _mm_load_ss(ptr + offsets[30]);
619 f1 = _mm_unpacklo_ps(f1, f2);
620 f3 = _mm_load_ss(ptr + offsets[29]);
621 f4 = _mm_load_ss(ptr + offsets[31]);
622 f3 = _mm_unpacklo_ps(f3, f4);
623 val8 = _mm_unpacklo_ps(f1, f3);
624
625 }
626
627 inline
scatter(float * ptr,const int * offsets) const628 void scatter(float *ptr, const int *offsets) const
629 {
630 __m128 tmp = val1;
631 _mm_store_ss(ptr + offsets[0], tmp);
632 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
633 _mm_store_ss(ptr + offsets[1], tmp);
634 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
635 _mm_store_ss(ptr + offsets[2], tmp);
636 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
637 _mm_store_ss(ptr + offsets[3], tmp);
638
639 tmp = val2;
640 _mm_store_ss(ptr + offsets[4], tmp);
641 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
642 _mm_store_ss(ptr + offsets[5], tmp);
643 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
644 _mm_store_ss(ptr + offsets[6], tmp);
645 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
646 _mm_store_ss(ptr + offsets[7], tmp);
647
648 tmp = val3;
649 _mm_store_ss(ptr + offsets[8], tmp);
650 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
651 _mm_store_ss(ptr + offsets[9], tmp);
652 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
653 _mm_store_ss(ptr + offsets[10], tmp);
654 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
655 _mm_store_ss(ptr + offsets[11], tmp);
656
657 tmp = val4;
658 _mm_store_ss(ptr + offsets[12], tmp);
659 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
660 _mm_store_ss(ptr + offsets[13], tmp);
661 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
662 _mm_store_ss(ptr + offsets[14], tmp);
663 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
664 _mm_store_ss(ptr + offsets[15], tmp);
665
666 tmp = val5;
667 _mm_store_ss(ptr + offsets[16], tmp);
668 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
669 _mm_store_ss(ptr + offsets[17], tmp);
670 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
671 _mm_store_ss(ptr + offsets[18], tmp);
672 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
673 _mm_store_ss(ptr + offsets[19], tmp);
674
675 tmp = val6;
676 _mm_store_ss(ptr + offsets[20], tmp);
677 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
678 _mm_store_ss(ptr + offsets[21], tmp);
679 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
680 _mm_store_ss(ptr + offsets[22], tmp);
681 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
682 _mm_store_ss(ptr + offsets[23], tmp);
683
684 tmp = val7;
685 _mm_store_ss(ptr + offsets[24], tmp);
686 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
687 _mm_store_ss(ptr + offsets[25], tmp);
688 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
689 _mm_store_ss(ptr + offsets[26], tmp);
690 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
691 _mm_store_ss(ptr + offsets[27], tmp);
692
693 tmp = val8;
694 _mm_store_ss(ptr + offsets[28], tmp);
695 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
696 _mm_store_ss(ptr + offsets[29], tmp);
697 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
698 _mm_store_ss(ptr + offsets[30], tmp);
699 tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
700 _mm_store_ss(ptr + offsets[31], tmp);
701 }
702 #endif
703
704 private:
705 __m128 val1;
706 __m128 val2;
707 __m128 val3;
708 __m128 val4;
709 __m128 val5;
710 __m128 val6;
711 __m128 val7;
712 __m128 val8;
713 };
714
715 inline
operator <<(float * data,const short_vec<float,32> & vec)716 void operator<<(float *data, const short_vec<float, 32>& vec)
717 {
718 vec.store(data);
719 }
720
721 template<>
722 class sqrt_reference<float, 32>
723 {
724 public:
725 template<typename OTHER_CARGO, int OTHER_ARITY>
726 friend class short_vec;
727
sqrt_reference(const short_vec<float,32> & vec)728 sqrt_reference(const short_vec<float, 32>& vec) :
729 vec(vec)
730 {}
731
732 private:
733 short_vec<float, 32> vec;
734 };
735
#ifdef __ICC
#pragma warning pop
#endif

740 inline
short_vec(const sqrt_reference<float,32> & other)741 short_vec<float, 32>::short_vec(const sqrt_reference<float, 32>& other) :
742 val1(_mm_sqrt_ps(other.vec.val1)),
743 val2(_mm_sqrt_ps(other.vec.val2)),
744 val3(_mm_sqrt_ps(other.vec.val3)),
745 val4(_mm_sqrt_ps(other.vec.val4)),
746 val5(_mm_sqrt_ps(other.vec.val5)),
747 val6(_mm_sqrt_ps(other.vec.val6)),
748 val7(_mm_sqrt_ps(other.vec.val7)),
749 val8(_mm_sqrt_ps(other.vec.val8))
750 {}
751
752 inline
operator /=(const sqrt_reference<float,32> & other)753 void short_vec<float, 32>::operator/=(const sqrt_reference<float, 32>& other)
754 {
755 val1 = _mm_mul_ps(val1, _mm_rsqrt_ps(other.vec.val1));
756 val2 = _mm_mul_ps(val2, _mm_rsqrt_ps(other.vec.val2));
757 val3 = _mm_mul_ps(val3, _mm_rsqrt_ps(other.vec.val3));
758 val4 = _mm_mul_ps(val4, _mm_rsqrt_ps(other.vec.val4));
759 val5 = _mm_mul_ps(val5, _mm_rsqrt_ps(other.vec.val5));
760 val6 = _mm_mul_ps(val6, _mm_rsqrt_ps(other.vec.val6));
761 val7 = _mm_mul_ps(val7, _mm_rsqrt_ps(other.vec.val7));
762 val8 = _mm_mul_ps(val8, _mm_rsqrt_ps(other.vec.val8));
763 }
764
765 inline
operator /(const sqrt_reference<float,32> & other) const766 short_vec<float, 32> short_vec<float, 32>::operator/(const sqrt_reference<float, 32>& other) const
767 {
768 return short_vec<float, 32>(
769 _mm_mul_ps(val1, _mm_rsqrt_ps(other.vec.val1)),
770 _mm_mul_ps(val2, _mm_rsqrt_ps(other.vec.val2)),
771 _mm_mul_ps(val3, _mm_rsqrt_ps(other.vec.val3)),
772 _mm_mul_ps(val4, _mm_rsqrt_ps(other.vec.val4)),
773 _mm_mul_ps(val5, _mm_rsqrt_ps(other.vec.val5)),
774 _mm_mul_ps(val6, _mm_rsqrt_ps(other.vec.val6)),
775 _mm_mul_ps(val7, _mm_rsqrt_ps(other.vec.val7)),
776 _mm_mul_ps(val8, _mm_rsqrt_ps(other.vec.val8)));
777 }
778
779 inline
sqrt(const short_vec<float,32> & vec)780 sqrt_reference<float, 32> sqrt(const short_vec<float, 32>& vec)
781 {
782 return sqrt_reference<float, 32>(vec);
783 }
784
785 template<typename _CharT, typename _Traits>
786 std::basic_ostream<_CharT, _Traits>&
operator <<(std::basic_ostream<_CharT,_Traits> & __os,const short_vec<float,32> & vec)787 operator<<(std::basic_ostream<_CharT, _Traits>& __os,
788 const short_vec<float, 32>& vec)
789 {
790 const float *data1 = reinterpret_cast<const float *>(&vec.val1);
791 const float *data2 = reinterpret_cast<const float *>(&vec.val2);
792 const float *data3 = reinterpret_cast<const float *>(&vec.val3);
793 const float *data4 = reinterpret_cast<const float *>(&vec.val4);
794 const float *data5 = reinterpret_cast<const float *>(&vec.val5);
795 const float *data6 = reinterpret_cast<const float *>(&vec.val6);
796 const float *data7 = reinterpret_cast<const float *>(&vec.val7);
797 const float *data8 = reinterpret_cast<const float *>(&vec.val8);
798 __os << "["
799 << data1[0] << ", " << data1[1] << ", " << data1[2] << ", " << data1[3] << ", "
800 << data2[0] << ", " << data2[1] << ", " << data2[2] << ", " << data2[3] << ", "
801 << data3[0] << ", " << data3[1] << ", " << data3[2] << ", " << data3[3] << ", "
802 << data4[0] << ", " << data4[1] << ", " << data4[2] << ", " << data4[3] << ", "
803 << data5[0] << ", " << data5[1] << ", " << data5[2] << ", " << data5[3] << ", "
804 << data6[0] << ", " << data6[1] << ", " << data6[2] << ", " << data6[3] << ", "
805 << data7[0] << ", " << data7[1] << ", " << data7[2] << ", " << data7[3] << ", "
806 << data8[0] << ", " << data8[1] << ", " << data8[2] << ", " << data8[3] << "]";
807 return __os;
808 }
809
}

#endif

#endif