/**
 * Copyright 2014-2016 Andreas Schäfer
 * Copyright 2015 Kurt Kanzenbach
 *
 * Distributed under the Boost Software License, Version 1.0. (See accompanying
 * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
 */
8 
9 #ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_FLOAT_32_HPP
10 #define FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_FLOAT_32_HPP
11 
12 #if (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE) ||             \
13     (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE2) ||            \
14     (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE4_1)
15 
16 #include <emmintrin.h>
17 #include <libflatarray/detail/sqrt_reference.hpp>
18 #include <libflatarray/detail/short_vec_helpers.hpp>
19 #include <libflatarray/config.h>
20 
21 #ifdef __SSE4_1__
22 #include <smmintrin.h>
23 #endif
24 
25 #ifdef LIBFLATARRAY_WITH_CPP14
26 #include <initializer_list>
27 #endif
28 
29 namespace LibFlatArray {
30 
31 template<typename CARGO, int ARITY>
32 class short_vec;
33 
34 template<typename CARGO, int ARITY>
35 class sqrt_reference;
36 
37 #ifdef __ICC
38 // disabling this warning as implicit type conversion is exactly our goal here:
39 #pragma warning push
40 #pragma warning (disable: 2304)
41 #endif
42 
43 template<>
44 class short_vec<float, 32>
45 {
46 public:
47     static const int ARITY = 32;
48     typedef short_vec<float, 32> mask_type;
49     typedef short_vec_strategy::sse strategy;
50 
51     template<typename _CharT, typename _Traits>
52     friend std::basic_ostream<_CharT, _Traits>& operator<<(
53         std::basic_ostream<_CharT, _Traits>& __os,
54         const short_vec<float, 32>& vec);
55 
56     inline
short_vec(const float data=0)57     short_vec(const float data = 0) :
58         val1(_mm_set1_ps(data)),
59         val2(_mm_set1_ps(data)),
60         val3(_mm_set1_ps(data)),
61         val4(_mm_set1_ps(data)),
62         val5(_mm_set1_ps(data)),
63         val6(_mm_set1_ps(data)),
64         val7(_mm_set1_ps(data)),
65         val8(_mm_set1_ps(data))
66     {}
67 
68     inline
short_vec(const float * data)69     short_vec(const float *data)
70     {
71         load(data);
72     }
73 
74     inline
short_vec(const __m128 & val1,const __m128 & val2,const __m128 & val3,const __m128 & val4,const __m128 & val5,const __m128 & val6,const __m128 & val7,const __m128 & val8)75     short_vec(
76         const __m128& val1,
77         const __m128& val2,
78         const __m128& val3,
79         const __m128& val4,
80         const __m128& val5,
81         const __m128& val6,
82         const __m128& val7,
83         const __m128& val8) :
84         val1(val1),
85         val2(val2),
86         val3(val3),
87         val4(val4),
88         val5(val5),
89         val6(val6),
90         val7(val7),
91         val8(val8)
92     {}
93 
94 #ifdef LIBFLATARRAY_WITH_CPP14
95     inline
short_vec(const std::initializer_list<float> & il)96     short_vec(const std::initializer_list<float>& il)
97     {
98         const float *ptr = static_cast<const float *>(&(*il.begin()));
99         load(ptr);
100     }
101 #endif
102 
103     inline
104     short_vec(const sqrt_reference<float, 32>& other);
105 
106     inline
any() const107     bool any() const
108     {
109         __m128 buf1 = _mm_or_ps(
110             _mm_or_ps(_mm_or_ps(val1, val2),
111                       _mm_or_ps(val3, val4)),
112             _mm_or_ps(_mm_or_ps(val5, val6),
113                       _mm_or_ps(val7, val8)));
114         __m128 buf2 = _mm_shuffle_ps(buf1, buf1, (3 << 2) | (2 << 0));
115         buf1 = _mm_or_ps(buf1, buf2);
116         buf2 = _mm_shuffle_ps(buf1, buf1, (1 << 0));
117         return _mm_cvtss_f32(buf1) || _mm_cvtss_f32(buf2);
118     }
119 
120     inline
get(int i) const121     float get(int i) const
122     {
123         __m128 buf;
124         if (i < 16) {
125             if (i < 8) {
126                 if (i < 4) {
127                     buf = val1;
128                 } else {
129                     buf = val2;
130                 }
131             } else {
132                 if (i < 12) {
133                     buf = val3;
134                 } else {
135                     buf = val4;
136                 }
137             }
138         } else {
139             if (i < 24) {
140                 if (i < 20) {
141                     buf = val5;
142                 } else {
143                     buf = val6;
144                 }
145             } else {
146                 if (i < 28) {
147                     buf = val7;
148                 } else {
149                     buf = val8;
150                 }
151             }
152         }
153 
154         i &= 3;
155 
156         if (i == 3) {
157             return _mm_cvtss_f32(_mm_shuffle_ps(buf, buf, 3));
158         }
159         if (i == 2) {
160             return _mm_cvtss_f32(_mm_shuffle_ps(buf, buf, 2));
161         }
162         if (i == 1) {
163             return _mm_cvtss_f32(_mm_shuffle_ps(buf, buf, 1));
164         }
165 
166         return _mm_cvtss_f32(buf);
167     }
168 
169     inline
operator -=(const short_vec<float,32> & other)170     void operator-=(const short_vec<float, 32>& other)
171     {
172         val1 = _mm_sub_ps(val1, other.val1);
173         val2 = _mm_sub_ps(val2, other.val2);
174         val3 = _mm_sub_ps(val3, other.val3);
175         val4 = _mm_sub_ps(val4, other.val4);
176         val5 = _mm_sub_ps(val5, other.val5);
177         val6 = _mm_sub_ps(val6, other.val6);
178         val7 = _mm_sub_ps(val7, other.val7);
179         val8 = _mm_sub_ps(val8, other.val8);
180     }
181 
182     inline
operator -(const short_vec<float,32> & other) const183     short_vec<float, 32> operator-(const short_vec<float, 32>& other) const
184     {
185         return short_vec<float, 32>(
186             _mm_sub_ps(val1, other.val1),
187             _mm_sub_ps(val2, other.val2),
188             _mm_sub_ps(val3, other.val3),
189             _mm_sub_ps(val4, other.val4),
190             _mm_sub_ps(val5, other.val5),
191             _mm_sub_ps(val6, other.val6),
192             _mm_sub_ps(val7, other.val7),
193             _mm_sub_ps(val8, other.val8));
194     }
195 
196     inline
operator +=(const short_vec<float,32> & other)197     void operator+=(const short_vec<float, 32>& other)
198     {
199         val1 = _mm_add_ps(val1, other.val1);
200         val2 = _mm_add_ps(val2, other.val2);
201         val3 = _mm_add_ps(val3, other.val3);
202         val4 = _mm_add_ps(val4, other.val4);
203         val5 = _mm_add_ps(val5, other.val5);
204         val6 = _mm_add_ps(val6, other.val6);
205         val7 = _mm_add_ps(val7, other.val7);
206         val8 = _mm_add_ps(val8, other.val8);
207     }
208 
209     inline
operator +(const short_vec<float,32> & other) const210     short_vec<float, 32> operator+(const short_vec<float, 32>& other) const
211     {
212         return short_vec<float, 32>(
213             _mm_add_ps(val1, other.val1),
214             _mm_add_ps(val2, other.val2),
215             _mm_add_ps(val3, other.val3),
216             _mm_add_ps(val4, other.val4),
217             _mm_add_ps(val5, other.val5),
218             _mm_add_ps(val6, other.val6),
219             _mm_add_ps(val7, other.val7),
220             _mm_add_ps(val8, other.val8));
221     }
222 
223     inline
operator *=(const short_vec<float,32> & other)224     void operator*=(const short_vec<float, 32>& other)
225     {
226         val1 = _mm_mul_ps(val1, other.val1);
227         val2 = _mm_mul_ps(val2, other.val2);
228         val3 = _mm_mul_ps(val3, other.val3);
229         val4 = _mm_mul_ps(val4, other.val4);
230         val5 = _mm_mul_ps(val5, other.val5);
231         val6 = _mm_mul_ps(val6, other.val6);
232         val7 = _mm_mul_ps(val7, other.val7);
233         val8 = _mm_mul_ps(val8, other.val8);
234     }
235 
236     inline
operator *(const short_vec<float,32> & other) const237     short_vec<float, 32> operator*(const short_vec<float, 32>& other) const
238     {
239         return short_vec<float, 32>(
240             _mm_mul_ps(val1, other.val1),
241             _mm_mul_ps(val2, other.val2),
242             _mm_mul_ps(val3, other.val3),
243             _mm_mul_ps(val4, other.val4),
244             _mm_mul_ps(val5, other.val5),
245             _mm_mul_ps(val6, other.val6),
246             _mm_mul_ps(val7, other.val7),
247             _mm_mul_ps(val8, other.val8));
248     }
249 
250     inline
operator /=(const short_vec<float,32> & other)251     void operator/=(const short_vec<float, 32>& other)
252     {
253         val1 = _mm_div_ps(val1, other.val1);
254         val2 = _mm_div_ps(val2, other.val2);
255         val3 = _mm_div_ps(val3, other.val3);
256         val4 = _mm_div_ps(val4, other.val4);
257         val5 = _mm_div_ps(val5, other.val5);
258         val6 = _mm_div_ps(val6, other.val6);
259         val7 = _mm_div_ps(val7, other.val7);
260         val8 = _mm_div_ps(val8, other.val8);
261     }
262 
263     inline
264     void operator/=(const sqrt_reference<float, 32>& other);
265 
266     inline
operator /(const short_vec<float,32> & other) const267     short_vec<float, 32> operator/(const short_vec<float, 32>& other) const
268     {
269         return short_vec<float, 32>(
270             _mm_div_ps(val1, other.val1),
271             _mm_div_ps(val2, other.val2),
272             _mm_div_ps(val3, other.val3),
273             _mm_div_ps(val4, other.val4),
274             _mm_div_ps(val5, other.val5),
275             _mm_div_ps(val6, other.val6),
276             _mm_div_ps(val7, other.val7),
277             _mm_div_ps(val8, other.val8));
278     }
279 
280     inline
281     short_vec<float, 32> operator/(const sqrt_reference<float, 32>& other) const;
282 
283     inline
operator <(const short_vec<float,32> & other) const284     short_vec<float, 32> operator<(const short_vec<float, 32>& other) const
285     {
286         return short_vec<float, 32>(
287             _mm_cmplt_ps(val1, other.val1),
288             _mm_cmplt_ps(val2, other.val2),
289             _mm_cmplt_ps(val3, other.val3),
290             _mm_cmplt_ps(val4, other.val4),
291             _mm_cmplt_ps(val5, other.val5),
292             _mm_cmplt_ps(val6, other.val6),
293             _mm_cmplt_ps(val7, other.val7),
294             _mm_cmplt_ps(val8, other.val8));
295     }
296 
297     inline
operator <=(const short_vec<float,32> & other) const298     short_vec<float, 32> operator<=(const short_vec<float, 32>& other) const
299     {
300         return short_vec<float, 32>(
301             _mm_cmple_ps(val1, other.val1),
302             _mm_cmple_ps(val2, other.val2),
303             _mm_cmple_ps(val3, other.val3),
304             _mm_cmple_ps(val4, other.val4),
305             _mm_cmple_ps(val5, other.val5),
306             _mm_cmple_ps(val6, other.val6),
307             _mm_cmple_ps(val7, other.val7),
308             _mm_cmple_ps(val8, other.val8));
309     }
310 
311     inline
operator ==(const short_vec<float,32> & other) const312     short_vec<float, 32> operator==(const short_vec<float, 32>& other) const
313     {
314         return short_vec<float, 32>(
315             _mm_cmpeq_ps(val1, other.val1),
316             _mm_cmpeq_ps(val2, other.val2),
317             _mm_cmpeq_ps(val3, other.val3),
318             _mm_cmpeq_ps(val4, other.val4),
319             _mm_cmpeq_ps(val5, other.val5),
320             _mm_cmpeq_ps(val6, other.val6),
321             _mm_cmpeq_ps(val7, other.val7),
322             _mm_cmpeq_ps(val8, other.val8));
323     }
324 
325     inline
operator >(const short_vec<float,32> & other) const326     short_vec<float, 32> operator>(const short_vec<float, 32>& other) const
327     {
328         return short_vec<float, 32>(
329             _mm_cmpgt_ps(val1, other.val1),
330             _mm_cmpgt_ps(val2, other.val2),
331             _mm_cmpgt_ps(val3, other.val3),
332             _mm_cmpgt_ps(val4, other.val4),
333             _mm_cmpgt_ps(val5, other.val5),
334             _mm_cmpgt_ps(val6, other.val6),
335             _mm_cmpgt_ps(val7, other.val7),
336             _mm_cmpgt_ps(val8, other.val8));
337     }
338 
339     inline
operator >=(const short_vec<float,32> & other) const340     short_vec<float, 32> operator>=(const short_vec<float, 32>& other) const
341     {
342         return short_vec<float, 32>(
343             _mm_cmpge_ps(val1, other.val1),
344             _mm_cmpge_ps(val2, other.val2),
345             _mm_cmpge_ps(val3, other.val3),
346             _mm_cmpge_ps(val4, other.val4),
347             _mm_cmpge_ps(val5, other.val5),
348             _mm_cmpge_ps(val6, other.val6),
349             _mm_cmpge_ps(val7, other.val7),
350             _mm_cmpge_ps(val8, other.val8));
351     }
352 
353     inline
sqrt() const354     short_vec<float, 32> sqrt() const
355     {
356         return short_vec<float, 32>(
357             _mm_sqrt_ps(val1),
358             _mm_sqrt_ps(val2),
359             _mm_sqrt_ps(val3),
360             _mm_sqrt_ps(val4),
361             _mm_sqrt_ps(val5),
362             _mm_sqrt_ps(val6),
363             _mm_sqrt_ps(val7),
364             _mm_sqrt_ps(val8));
365     }
366 
367     inline
load(const float * data)368     void load(const float *data)
369     {
370         val1 = _mm_loadu_ps(data +  0);
371         val2 = _mm_loadu_ps(data +  4);
372         val3 = _mm_loadu_ps(data +  8);
373         val4 = _mm_loadu_ps(data + 12);
374         val5 = _mm_loadu_ps(data + 16);
375         val6 = _mm_loadu_ps(data + 20);
376         val7 = _mm_loadu_ps(data + 24);
377         val8 = _mm_loadu_ps(data + 28);
378     }
379 
380     inline
load_aligned(const float * data)381     void load_aligned(const float *data)
382     {
383         SHORTVEC_ASSERT_ALIGNED(data, 16);
384         val1 = _mm_load_ps(data +  0);
385         val2 = _mm_load_ps(data +  4);
386         val3 = _mm_load_ps(data +  8);
387         val4 = _mm_load_ps(data + 12);
388         val5 = _mm_load_ps(data + 16);
389         val6 = _mm_load_ps(data + 20);
390         val7 = _mm_load_ps(data + 24);
391         val8 = _mm_load_ps(data + 28);
392     }
393 
394     inline
store(float * data) const395     void store(float *data) const
396     {
397         _mm_storeu_ps(data +  0, val1);
398         _mm_storeu_ps(data +  4, val2);
399         _mm_storeu_ps(data +  8, val3);
400         _mm_storeu_ps(data + 12, val4);
401         _mm_storeu_ps(data + 16, val5);
402         _mm_storeu_ps(data + 20, val6);
403         _mm_storeu_ps(data + 24, val7);
404         _mm_storeu_ps(data + 28, val8);
405     }
406 
407     inline
store_aligned(float * data) const408     void store_aligned(float *data) const
409     {
410         SHORTVEC_ASSERT_ALIGNED(data, 16);
411         _mm_store_ps(data +  0, val1);
412         _mm_store_ps(data +  4, val2);
413         _mm_store_ps(data +  8, val3);
414         _mm_store_ps(data + 12, val4);
415         _mm_store_ps(data + 16, val5);
416         _mm_store_ps(data + 20, val6);
417         _mm_store_ps(data + 24, val7);
418         _mm_store_ps(data + 28, val8);
419     }
420 
421     inline
store_nt(float * data) const422     void store_nt(float *data) const
423     {
424         SHORTVEC_ASSERT_ALIGNED(data, 16);
425         _mm_stream_ps(data +  0, val1);
426         _mm_stream_ps(data +  4, val2);
427         _mm_stream_ps(data +  8, val3);
428         _mm_stream_ps(data + 12, val4);
429         _mm_stream_ps(data + 16, val5);
430         _mm_stream_ps(data + 20, val6);
431         _mm_stream_ps(data + 24, val7);
432         _mm_stream_ps(data + 28, val8);
433     }
434 
435 #ifdef __SSE4_1__
436     inline
gather(const float * ptr,const int * offsets)437     void gather(const float *ptr, const int *offsets)
438     {
439         val1 = _mm_load_ss(ptr + offsets[0]);
440         SHORTVEC_INSERT_PS(val1, ptr, offsets[ 1], _MM_MK_INSERTPS_NDX(0,1,0));
441         SHORTVEC_INSERT_PS(val1, ptr, offsets[ 2], _MM_MK_INSERTPS_NDX(0,2,0));
442         SHORTVEC_INSERT_PS(val1, ptr, offsets[ 3], _MM_MK_INSERTPS_NDX(0,3,0));
443 
444         val2 = _mm_load_ss(ptr + offsets[4]);
445         SHORTVEC_INSERT_PS(val2, ptr, offsets[ 5], _MM_MK_INSERTPS_NDX(0,1,0));
446         SHORTVEC_INSERT_PS(val2, ptr, offsets[ 6], _MM_MK_INSERTPS_NDX(0,2,0));
447         SHORTVEC_INSERT_PS(val2, ptr, offsets[ 7], _MM_MK_INSERTPS_NDX(0,3,0));
448 
449         val3 = _mm_load_ss(ptr + offsets[8]);
450         SHORTVEC_INSERT_PS(val3, ptr, offsets[ 9], _MM_MK_INSERTPS_NDX(0,1,0));
451         SHORTVEC_INSERT_PS(val3, ptr, offsets[10], _MM_MK_INSERTPS_NDX(0,2,0));
452         SHORTVEC_INSERT_PS(val3, ptr, offsets[11], _MM_MK_INSERTPS_NDX(0,3,0));
453 
454         val4 = _mm_load_ss(ptr + offsets[12]);
455         SHORTVEC_INSERT_PS(val4, ptr, offsets[13], _MM_MK_INSERTPS_NDX(0,1,0));
456         SHORTVEC_INSERT_PS(val4, ptr, offsets[14], _MM_MK_INSERTPS_NDX(0,2,0));
457         SHORTVEC_INSERT_PS(val4, ptr, offsets[15], _MM_MK_INSERTPS_NDX(0,3,0));
458 
459         val5 = _mm_load_ss(ptr + offsets[16]);
460         SHORTVEC_INSERT_PS(val5, ptr, offsets[17], _MM_MK_INSERTPS_NDX(0,1,0));
461         SHORTVEC_INSERT_PS(val5, ptr, offsets[18], _MM_MK_INSERTPS_NDX(0,2,0));
462         SHORTVEC_INSERT_PS(val5, ptr, offsets[19], _MM_MK_INSERTPS_NDX(0,3,0));
463 
464         val6 = _mm_load_ss(ptr + offsets[20]);
465         SHORTVEC_INSERT_PS(val6, ptr, offsets[21], _MM_MK_INSERTPS_NDX(0,1,0));
466         SHORTVEC_INSERT_PS(val6, ptr, offsets[22], _MM_MK_INSERTPS_NDX(0,2,0));
467         SHORTVEC_INSERT_PS(val6, ptr, offsets[23], _MM_MK_INSERTPS_NDX(0,3,0));
468 
469         val7 = _mm_load_ss(ptr + offsets[24]);
470         SHORTVEC_INSERT_PS(val7, ptr, offsets[25], _MM_MK_INSERTPS_NDX(0,1,0));
471         SHORTVEC_INSERT_PS(val7, ptr, offsets[26], _MM_MK_INSERTPS_NDX(0,2,0));
472         SHORTVEC_INSERT_PS(val7, ptr, offsets[27], _MM_MK_INSERTPS_NDX(0,3,0));
473 
474         val8 = _mm_load_ss(ptr + offsets[28]);
475         SHORTVEC_INSERT_PS(val8, ptr, offsets[29], _MM_MK_INSERTPS_NDX(0,1,0));
476         SHORTVEC_INSERT_PS(val8, ptr, offsets[30], _MM_MK_INSERTPS_NDX(0,2,0));
477         SHORTVEC_INSERT_PS(val8, ptr, offsets[31], _MM_MK_INSERTPS_NDX(0,3,0));
478     }
479 
480     inline
scatter(float * ptr,const int * offsets) const481     void scatter(float *ptr, const int *offsets) const
482     {
483         ShortVecHelpers::ExtractResult r1, r2, r3, r4;
484         r1.i = _mm_extract_ps(val1, 0);
485         r2.i = _mm_extract_ps(val1, 1);
486         r3.i = _mm_extract_ps(val1, 2);
487         r4.i = _mm_extract_ps(val1, 3);
488         ptr[offsets[0]] = r1.f;
489         ptr[offsets[1]] = r2.f;
490         ptr[offsets[2]] = r3.f;
491         ptr[offsets[3]] = r4.f;
492 
493         r1.i = _mm_extract_ps(val2, 0);
494         r2.i = _mm_extract_ps(val2, 1);
495         r3.i = _mm_extract_ps(val2, 2);
496         r4.i = _mm_extract_ps(val2, 3);
497         ptr[offsets[4]] = r1.f;
498         ptr[offsets[5]] = r2.f;
499         ptr[offsets[6]] = r3.f;
500         ptr[offsets[7]] = r4.f;
501 
502         r1.i = _mm_extract_ps(val3, 0);
503         r2.i = _mm_extract_ps(val3, 1);
504         r3.i = _mm_extract_ps(val3, 2);
505         r4.i = _mm_extract_ps(val3, 3);
506         ptr[offsets[ 8]] = r1.f;
507         ptr[offsets[ 9]] = r2.f;
508         ptr[offsets[10]] = r3.f;
509         ptr[offsets[11]] = r4.f;
510 
511         r1.i = _mm_extract_ps(val4, 0);
512         r2.i = _mm_extract_ps(val4, 1);
513         r3.i = _mm_extract_ps(val4, 2);
514         r4.i = _mm_extract_ps(val4, 3);
515         ptr[offsets[12]] = r1.f;
516         ptr[offsets[13]] = r2.f;
517         ptr[offsets[14]] = r3.f;
518         ptr[offsets[15]] = r4.f;
519 
520         r1.i = _mm_extract_ps(val5, 0);
521         r2.i = _mm_extract_ps(val5, 1);
522         r3.i = _mm_extract_ps(val5, 2);
523         r4.i = _mm_extract_ps(val5, 3);
524         ptr[offsets[16]] = r1.f;
525         ptr[offsets[17]] = r2.f;
526         ptr[offsets[18]] = r3.f;
527         ptr[offsets[19]] = r4.f;
528 
529         r1.i = _mm_extract_ps(val6, 0);
530         r2.i = _mm_extract_ps(val6, 1);
531         r3.i = _mm_extract_ps(val6, 2);
532         r4.i = _mm_extract_ps(val6, 3);
533         ptr[offsets[20]] = r1.f;
534         ptr[offsets[21]] = r2.f;
535         ptr[offsets[22]] = r3.f;
536         ptr[offsets[23]] = r4.f;
537 
538         r1.i = _mm_extract_ps(val7, 0);
539         r2.i = _mm_extract_ps(val7, 1);
540         r3.i = _mm_extract_ps(val7, 2);
541         r4.i = _mm_extract_ps(val7, 3);
542         ptr[offsets[24]] = r1.f;
543         ptr[offsets[25]] = r2.f;
544         ptr[offsets[26]] = r3.f;
545         ptr[offsets[27]] = r4.f;
546 
547         r1.i = _mm_extract_ps(val8, 0);
548         r2.i = _mm_extract_ps(val8, 1);
549         r3.i = _mm_extract_ps(val8, 2);
550         r4.i = _mm_extract_ps(val8, 3);
551         ptr[offsets[28]] = r1.f;
552         ptr[offsets[29]] = r2.f;
553         ptr[offsets[30]] = r3.f;
554         ptr[offsets[31]] = r4.f;
555     }
556 #else
557     inline
gather(const float * ptr,const int * offsets)558     void gather(const float *ptr, const int *offsets)
559     {
560         __m128 f1, f2, f3, f4;
561         f1   = _mm_load_ss(ptr + offsets[0]);
562         f2   = _mm_load_ss(ptr + offsets[2]);
563         f1   = _mm_unpacklo_ps(f1, f2);
564         f3   = _mm_load_ss(ptr + offsets[1]);
565         f4   = _mm_load_ss(ptr + offsets[3]);
566         f3   = _mm_unpacklo_ps(f3, f4);
567         val1 = _mm_unpacklo_ps(f1, f3);
568 
569         f1   = _mm_load_ss(ptr + offsets[4]);
570         f2   = _mm_load_ss(ptr + offsets[6]);
571         f1   = _mm_unpacklo_ps(f1, f2);
572         f3   = _mm_load_ss(ptr + offsets[5]);
573         f4   = _mm_load_ss(ptr + offsets[7]);
574         f3   = _mm_unpacklo_ps(f3, f4);
575         val2 = _mm_unpacklo_ps(f1, f3);
576 
577         f1   = _mm_load_ss(ptr + offsets[ 8]);
578         f2   = _mm_load_ss(ptr + offsets[10]);
579         f1   = _mm_unpacklo_ps(f1, f2);
580         f3   = _mm_load_ss(ptr + offsets[ 9]);
581         f4   = _mm_load_ss(ptr + offsets[11]);
582         f3   = _mm_unpacklo_ps(f3, f4);
583         val3 = _mm_unpacklo_ps(f1, f3);
584 
585         f1   = _mm_load_ss(ptr + offsets[12]);
586         f2   = _mm_load_ss(ptr + offsets[14]);
587         f1   = _mm_unpacklo_ps(f1, f2);
588         f3   = _mm_load_ss(ptr + offsets[13]);
589         f4   = _mm_load_ss(ptr + offsets[15]);
590         f3   = _mm_unpacklo_ps(f3, f4);
591         val4 = _mm_unpacklo_ps(f1, f3);
592 
593         f1   = _mm_load_ss(ptr + offsets[16]);
594         f2   = _mm_load_ss(ptr + offsets[18]);
595         f1   = _mm_unpacklo_ps(f1, f2);
596         f3   = _mm_load_ss(ptr + offsets[17]);
597         f4   = _mm_load_ss(ptr + offsets[19]);
598         f3   = _mm_unpacklo_ps(f3, f4);
599         val5 = _mm_unpacklo_ps(f1, f3);
600 
601         f1   = _mm_load_ss(ptr + offsets[20]);
602         f2   = _mm_load_ss(ptr + offsets[22]);
603         f1   = _mm_unpacklo_ps(f1, f2);
604         f3   = _mm_load_ss(ptr + offsets[21]);
605         f4   = _mm_load_ss(ptr + offsets[23]);
606         f3   = _mm_unpacklo_ps(f3, f4);
607         val6 = _mm_unpacklo_ps(f1, f3);
608 
609         f1   = _mm_load_ss(ptr + offsets[24]);
610         f2   = _mm_load_ss(ptr + offsets[26]);
611         f1   = _mm_unpacklo_ps(f1, f2);
612         f3   = _mm_load_ss(ptr + offsets[25]);
613         f4   = _mm_load_ss(ptr + offsets[27]);
614         f3   = _mm_unpacklo_ps(f3, f4);
615         val7 = _mm_unpacklo_ps(f1, f3);
616 
617         f1   = _mm_load_ss(ptr + offsets[28]);
618         f2   = _mm_load_ss(ptr + offsets[30]);
619         f1   = _mm_unpacklo_ps(f1, f2);
620         f3   = _mm_load_ss(ptr + offsets[29]);
621         f4   = _mm_load_ss(ptr + offsets[31]);
622         f3   = _mm_unpacklo_ps(f3, f4);
623         val8 = _mm_unpacklo_ps(f1, f3);
624 
625     }
626 
627     inline
scatter(float * ptr,const int * offsets) const628     void scatter(float *ptr, const int *offsets) const
629     {
630         __m128 tmp = val1;
631         _mm_store_ss(ptr + offsets[0], tmp);
632         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
633         _mm_store_ss(ptr + offsets[1], tmp);
634         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
635         _mm_store_ss(ptr + offsets[2], tmp);
636         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
637         _mm_store_ss(ptr + offsets[3], tmp);
638 
639         tmp = val2;
640         _mm_store_ss(ptr + offsets[4], tmp);
641         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
642         _mm_store_ss(ptr + offsets[5], tmp);
643         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
644         _mm_store_ss(ptr + offsets[6], tmp);
645         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
646         _mm_store_ss(ptr + offsets[7], tmp);
647 
648         tmp = val3;
649         _mm_store_ss(ptr + offsets[8], tmp);
650         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
651         _mm_store_ss(ptr + offsets[9], tmp);
652         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
653         _mm_store_ss(ptr + offsets[10], tmp);
654         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
655         _mm_store_ss(ptr + offsets[11], tmp);
656 
657         tmp = val4;
658         _mm_store_ss(ptr + offsets[12], tmp);
659         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
660         _mm_store_ss(ptr + offsets[13], tmp);
661         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
662         _mm_store_ss(ptr + offsets[14], tmp);
663         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
664         _mm_store_ss(ptr + offsets[15], tmp);
665 
666         tmp = val5;
667         _mm_store_ss(ptr + offsets[16], tmp);
668         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
669         _mm_store_ss(ptr + offsets[17], tmp);
670         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
671         _mm_store_ss(ptr + offsets[18], tmp);
672         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
673         _mm_store_ss(ptr + offsets[19], tmp);
674 
675         tmp = val6;
676         _mm_store_ss(ptr + offsets[20], tmp);
677         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
678         _mm_store_ss(ptr + offsets[21], tmp);
679         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
680         _mm_store_ss(ptr + offsets[22], tmp);
681         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
682         _mm_store_ss(ptr + offsets[23], tmp);
683 
684         tmp = val7;
685         _mm_store_ss(ptr + offsets[24], tmp);
686         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
687         _mm_store_ss(ptr + offsets[25], tmp);
688         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
689         _mm_store_ss(ptr + offsets[26], tmp);
690         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
691         _mm_store_ss(ptr + offsets[27], tmp);
692 
693         tmp = val8;
694         _mm_store_ss(ptr + offsets[28], tmp);
695         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
696         _mm_store_ss(ptr + offsets[29], tmp);
697         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
698         _mm_store_ss(ptr + offsets[30], tmp);
699         tmp = _mm_shuffle_ps(tmp, tmp, _MM_SHUFFLE(0,3,2,1));
700         _mm_store_ss(ptr + offsets[31], tmp);
701    }
702 #endif
703 
704 private:
705     __m128 val1;
706     __m128 val2;
707     __m128 val3;
708     __m128 val4;
709     __m128 val5;
710     __m128 val6;
711     __m128 val7;
712     __m128 val8;
713 };
714 
715 inline
operator <<(float * data,const short_vec<float,32> & vec)716 void operator<<(float *data, const short_vec<float, 32>& vec)
717 {
718     vec.store(data);
719 }
720 
721 template<>
722 class sqrt_reference<float, 32>
723 {
724 public:
725     template<typename OTHER_CARGO, int OTHER_ARITY>
726     friend class short_vec;
727 
sqrt_reference(const short_vec<float,32> & vec)728     sqrt_reference(const short_vec<float, 32>& vec) :
729         vec(vec)
730     {}
731 
732 private:
733     short_vec<float, 32> vec;
734 };
735 
#ifdef __ICC
#pragma warning pop
#endif
739 
740 inline
short_vec(const sqrt_reference<float,32> & other)741 short_vec<float, 32>::short_vec(const sqrt_reference<float, 32>& other) :
742     val1(_mm_sqrt_ps(other.vec.val1)),
743     val2(_mm_sqrt_ps(other.vec.val2)),
744     val3(_mm_sqrt_ps(other.vec.val3)),
745     val4(_mm_sqrt_ps(other.vec.val4)),
746     val5(_mm_sqrt_ps(other.vec.val5)),
747     val6(_mm_sqrt_ps(other.vec.val6)),
748     val7(_mm_sqrt_ps(other.vec.val7)),
749     val8(_mm_sqrt_ps(other.vec.val8))
750 {}
751 
752 inline
operator /=(const sqrt_reference<float,32> & other)753 void short_vec<float, 32>::operator/=(const sqrt_reference<float, 32>& other)
754 {
755     val1 = _mm_mul_ps(val1, _mm_rsqrt_ps(other.vec.val1));
756     val2 = _mm_mul_ps(val2, _mm_rsqrt_ps(other.vec.val2));
757     val3 = _mm_mul_ps(val3, _mm_rsqrt_ps(other.vec.val3));
758     val4 = _mm_mul_ps(val4, _mm_rsqrt_ps(other.vec.val4));
759     val5 = _mm_mul_ps(val5, _mm_rsqrt_ps(other.vec.val5));
760     val6 = _mm_mul_ps(val6, _mm_rsqrt_ps(other.vec.val6));
761     val7 = _mm_mul_ps(val7, _mm_rsqrt_ps(other.vec.val7));
762     val8 = _mm_mul_ps(val8, _mm_rsqrt_ps(other.vec.val8));
763 }
764 
765 inline
operator /(const sqrt_reference<float,32> & other) const766 short_vec<float, 32> short_vec<float, 32>::operator/(const sqrt_reference<float, 32>& other) const
767 {
768     return short_vec<float, 32>(
769         _mm_mul_ps(val1, _mm_rsqrt_ps(other.vec.val1)),
770         _mm_mul_ps(val2, _mm_rsqrt_ps(other.vec.val2)),
771         _mm_mul_ps(val3, _mm_rsqrt_ps(other.vec.val3)),
772         _mm_mul_ps(val4, _mm_rsqrt_ps(other.vec.val4)),
773         _mm_mul_ps(val5, _mm_rsqrt_ps(other.vec.val5)),
774         _mm_mul_ps(val6, _mm_rsqrt_ps(other.vec.val6)),
775         _mm_mul_ps(val7, _mm_rsqrt_ps(other.vec.val7)),
776         _mm_mul_ps(val8, _mm_rsqrt_ps(other.vec.val8)));
777 }
778 
779 inline
sqrt(const short_vec<float,32> & vec)780 sqrt_reference<float, 32> sqrt(const short_vec<float, 32>& vec)
781 {
782     return sqrt_reference<float, 32>(vec);
783 }
784 
785 template<typename _CharT, typename _Traits>
786 std::basic_ostream<_CharT, _Traits>&
operator <<(std::basic_ostream<_CharT,_Traits> & __os,const short_vec<float,32> & vec)787 operator<<(std::basic_ostream<_CharT, _Traits>& __os,
788            const short_vec<float, 32>& vec)
789 {
790     const float *data1 = reinterpret_cast<const float *>(&vec.val1);
791     const float *data2 = reinterpret_cast<const float *>(&vec.val2);
792     const float *data3 = reinterpret_cast<const float *>(&vec.val3);
793     const float *data4 = reinterpret_cast<const float *>(&vec.val4);
794     const float *data5 = reinterpret_cast<const float *>(&vec.val5);
795     const float *data6 = reinterpret_cast<const float *>(&vec.val6);
796     const float *data7 = reinterpret_cast<const float *>(&vec.val7);
797     const float *data8 = reinterpret_cast<const float *>(&vec.val8);
798     __os << "["
799          << data1[0] << ", " << data1[1]  << ", " << data1[2] << ", " << data1[3] << ", "
800          << data2[0] << ", " << data2[1]  << ", " << data2[2] << ", " << data2[3] << ", "
801          << data3[0] << ", " << data3[1]  << ", " << data3[2] << ", " << data3[3] << ", "
802          << data4[0] << ", " << data4[1]  << ", " << data4[2] << ", " << data4[3] << ", "
803          << data5[0] << ", " << data5[1]  << ", " << data5[2] << ", " << data5[3] << ", "
804          << data6[0] << ", " << data6[1]  << ", " << data6[2] << ", " << data6[3] << ", "
805          << data7[0] << ", " << data7[1]  << ", " << data7[2] << ", " << data7[3] << ", "
806          << data8[0] << ", " << data8[1]  << ", " << data8[2] << ", " << data8[3] << "]";
807     return __os;
808 }
809 
810 }
811 
812 #endif
813 
814 #endif
815