/**
 * Copyright 2016 Andreas Schäfer
 * Copyright 2015 Kurt Kanzenbach
 *
 * Distributed under the Boost Software License, Version 1.0. (See accompanying
 * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
 */

#ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_INT_16_HPP
#define FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_INT_16_HPP

#if (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE) ||             \
    (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE2) ||            \
    (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE4_1) ||          \
    (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_AVX)

#include <emmintrin.h>
#include <libflatarray/detail/sqrt_reference.hpp>
#include <libflatarray/detail/short_vec_helpers.hpp>
#include <libflatarray/config.h>
#include <iostream>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#ifdef LIBFLATARRAY_WITH_CPP14
#include <initializer_list>
#endif

namespace LibFlatArray {

template<typename CARGO, int ARITY>
class short_vec;

template<typename CARGO, int ARITY>
class sqrt_reference;

#ifdef __ICC
// disabling this warning as implicit type conversion is exactly our goal here:
#pragma warning push
#pragma warning (disable: 2304)
#endif

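// short_vec<int, 16> bundles four __m128i registers so that 16 ints can
// be handled as one object. A minimal usage sketch (hypothetical buffer
// name, at least 16 elements assumed):
//
//   int buf[16] = { /* ... */ };
//   short_vec<int, 16> a(buf);   // unaligned load
//   short_vec<int, 16> b(2);     // broadcast a scalar
//   a += b;
//   a.store(buf);                // unaligned store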
template<>
class short_vec<int, 16>
{
public:
    static const int ARITY = 16;

    typedef short_vec_strategy::sse strategy;

    template<typename _CharT, typename _Traits>
    friend std::basic_ostream<_CharT, _Traits>& operator<<(
        std::basic_ostream<_CharT, _Traits>& __os,
        const short_vec<int, 16>& vec);

    inline
    short_vec(const int data = 0) :
        val1(_mm_set1_epi32(data)),
        val2(_mm_set1_epi32(data)),
        val3(_mm_set1_epi32(data)),
        val4(_mm_set1_epi32(data))
    {}

    inline
    short_vec(const int *data)
    {
        load(data);
    }

    inline
    short_vec(const __m128i& val1, const __m128i& val2,
              const __m128i& val3, const __m128i& val4) :
        val1(val1),
        val2(val2),
        val3(val3),
        val4(val4)
    {}

#ifdef LIBFLATARRAY_WITH_CPP14
    inline
    short_vec(const std::initializer_list<int>& il)
    {
        // il.begin() already yields a const int *, no cast required:
        load(il.begin());
    }
#endif

    inline
    short_vec(const sqrt_reference<int, 16>& other);

    inline
    void operator-=(const short_vec<int, 16>& other)
    {
        val1 = _mm_sub_epi32(val1, other.val1);
        val2 = _mm_sub_epi32(val2, other.val2);
        val3 = _mm_sub_epi32(val3, other.val3);
        val4 = _mm_sub_epi32(val4, other.val4);
    }

    inline
    short_vec<int, 16> operator-(const short_vec<int, 16>& other) const
    {
        return short_vec<int, 16>(
            _mm_sub_epi32(val1, other.val1),
            _mm_sub_epi32(val2, other.val2),
            _mm_sub_epi32(val3, other.val3),
            _mm_sub_epi32(val4, other.val4));
    }

    inline
    void operator+=(const short_vec<int, 16>& other)
    {
        val1 = _mm_add_epi32(val1, other.val1);
        val2 = _mm_add_epi32(val2, other.val2);
        val3 = _mm_add_epi32(val3, other.val3);
        val4 = _mm_add_epi32(val4, other.val4);
    }

    inline
    short_vec<int, 16> operator+(const short_vec<int, 16>& other) const
    {
        return short_vec<int, 16>(
            _mm_add_epi32(val1, other.val1),
            _mm_add_epi32(val2, other.val2),
            _mm_add_epi32(val3, other.val3),
            _mm_add_epi32(val4, other.val4));
    }

#ifdef __SSE4_1__
    inline
    void operator*=(const short_vec<int, 16>& other)
    {
        val1 = _mm_mullo_epi32(val1, other.val1);
        val2 = _mm_mullo_epi32(val2, other.val2);
        val3 = _mm_mullo_epi32(val3, other.val3);
        val4 = _mm_mullo_epi32(val4, other.val4);
    }

    inline
    short_vec<int, 16> operator*(const short_vec<int, 16>& other) const
    {
        return short_vec<int, 16>(
            _mm_mullo_epi32(val1, other.val1),
            _mm_mullo_epi32(val2, other.val2),
            _mm_mullo_epi32(val3, other.val3),
            _mm_mullo_epi32(val4, other.val4));
    }
#else
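    // SSE2 has no packed 32-bit multiply that keeps the low halves
    // (_mm_mullo_epi32 is SSE4.1 only), so one is assembled from
    // _mm_mul_epu32: that instruction multiplies the even-indexed lanes
    // into 64-bit products; shifting the inputs right by 4 bytes exposes
    // the odd lanes, the shuffles gather the low 32 bits of each product,
    // and the final unpack interleaves even and odd results back into
    // lane order. The low 32 bits of a product are identical for signed
    // and unsigned operands, so the unsigned multiply is safe for int.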
    inline
    void operator*=(const short_vec<int, 16>& other)
    {
        // see: https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
        __m128i tmp1 = _mm_mul_epu32(val1, other.val1);
        __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(val1, 4),
                                     _mm_srli_si128(other.val1, 4));
        val1 = _mm_unpacklo_epi32(
            _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
            _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));

        tmp1 = _mm_mul_epu32(val2, other.val2);
        tmp2 = _mm_mul_epu32(_mm_srli_si128(val2, 4),
                             _mm_srli_si128(other.val2, 4));
        val2 = _mm_unpacklo_epi32(
            _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
            _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));

        tmp1 = _mm_mul_epu32(val3, other.val3);
        tmp2 = _mm_mul_epu32(_mm_srli_si128(val3, 4),
                             _mm_srli_si128(other.val3, 4));
        val3 = _mm_unpacklo_epi32(
            _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
            _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));

        tmp1 = _mm_mul_epu32(val4, other.val4);
        tmp2 = _mm_mul_epu32(_mm_srli_si128(val4, 4),
                             _mm_srli_si128(other.val4, 4));
        val4 = _mm_unpacklo_epi32(
            _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
            _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
    }

    inline
    short_vec<int, 16> operator*(const short_vec<int, 16>& other) const
    {
        // see: https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
        __m128i tmp1 = _mm_mul_epu32(val1, other.val1);
        __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(val1, 4),
                                     _mm_srli_si128(other.val1, 4));
        __m128i result1 = _mm_unpacklo_epi32(
            _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
            _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));

        tmp1 = _mm_mul_epu32(val2, other.val2);
        tmp2 = _mm_mul_epu32(_mm_srli_si128(val2, 4),
                             _mm_srli_si128(other.val2, 4));
        __m128i result2 = _mm_unpacklo_epi32(
            _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
            _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));

        tmp1 = _mm_mul_epu32(val3, other.val3);
        tmp2 = _mm_mul_epu32(_mm_srli_si128(val3, 4),
                             _mm_srli_si128(other.val3, 4));
        __m128i result3 = _mm_unpacklo_epi32(
            _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
            _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));

        tmp1 = _mm_mul_epu32(val4, other.val4);
        tmp2 = _mm_mul_epu32(_mm_srli_si128(val4, 4),
                             _mm_srli_si128(other.val4, 4));
        __m128i result4 = _mm_unpacklo_epi32(
            _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
            _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));

        return short_vec<int, 16>(result1, result2, result3, result4);
    }
#endif

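    // Integer division is emulated in single precision and truncated
    // toward zero, matching C++ integer division. Caveat: float carries
    // only a 24-bit mantissa, so operands of magnitude beyond 2^24 may
    // be rounded before the division.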
    inline
    void operator/=(const short_vec<int, 16>& other)
    {
        val1 = _mm_cvttps_epi32(_mm_div_ps(_mm_cvtepi32_ps(val1),
                                           _mm_cvtepi32_ps(other.val1)));
        val2 = _mm_cvttps_epi32(_mm_div_ps(_mm_cvtepi32_ps(val2),
                                           _mm_cvtepi32_ps(other.val2)));
        val3 = _mm_cvttps_epi32(_mm_div_ps(_mm_cvtepi32_ps(val3),
                                           _mm_cvtepi32_ps(other.val3)));
        val4 = _mm_cvttps_epi32(_mm_div_ps(_mm_cvtepi32_ps(val4),
                                           _mm_cvtepi32_ps(other.val4)));
    }

    inline
    void operator/=(const sqrt_reference<int, 16>& other);

    inline
    short_vec<int, 16> operator/(const short_vec<int, 16>& other) const
    {
        return short_vec<int, 16>(
            _mm_cvttps_epi32(_mm_div_ps(
                                 _mm_cvtepi32_ps(val1),
                                 _mm_cvtepi32_ps(other.val1))),
            _mm_cvttps_epi32(_mm_div_ps(
                                 _mm_cvtepi32_ps(val2),
                                 _mm_cvtepi32_ps(other.val2))),
            _mm_cvttps_epi32(_mm_div_ps(
                                 _mm_cvtepi32_ps(val3),
                                 _mm_cvtepi32_ps(other.val3))),
            _mm_cvttps_epi32(_mm_div_ps(
                                 _mm_cvtepi32_ps(val4),
                                 _mm_cvtepi32_ps(other.val4))));
    }

    inline
    short_vec<int, 16> operator/(const sqrt_reference<int, 16>& other) const;

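    // Square root is computed in single precision; results are rounded
    // to the nearest integer.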
    inline
    short_vec<int, 16> sqrt() const
    {
        return short_vec<int, 16>(
            _mm_cvtps_epi32(
                _mm_sqrt_ps(_mm_cvtepi32_ps(val1))),
            _mm_cvtps_epi32(
                _mm_sqrt_ps(_mm_cvtepi32_ps(val2))),
            _mm_cvtps_epi32(
                _mm_sqrt_ps(_mm_cvtepi32_ps(val3))),
            _mm_cvtps_epi32(
                _mm_sqrt_ps(_mm_cvtepi32_ps(val4))));
    }

    inline
    void load(const int *data)
    {
        val1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data +  0));
        val2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data +  4));
        val3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data +  8));
        val4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + 12));
    }

    inline
    void load_aligned(const int *data)
    {
        SHORTVEC_ASSERT_ALIGNED(data, 16);
        val1 = _mm_load_si128(reinterpret_cast<const __m128i *>(data +  0));
        val2 = _mm_load_si128(reinterpret_cast<const __m128i *>(data +  4));
        val3 = _mm_load_si128(reinterpret_cast<const __m128i *>(data +  8));
        val4 = _mm_load_si128(reinterpret_cast<const __m128i *>(data + 12));
    }

    inline
    void store(int *data) const
    {
        _mm_storeu_si128(reinterpret_cast<__m128i *>(data +  0), val1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(data +  4), val2);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(data +  8), val3);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(data + 12), val4);
    }

    inline
    void store_aligned(int *data) const
    {
        SHORTVEC_ASSERT_ALIGNED(data, 16);
        _mm_store_si128(reinterpret_cast<__m128i *>(data +  0), val1);
        _mm_store_si128(reinterpret_cast<__m128i *>(data +  4), val2);
        _mm_store_si128(reinterpret_cast<__m128i *>(data +  8), val3);
        _mm_store_si128(reinterpret_cast<__m128i *>(data + 12), val4);
    }

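    // Non-temporal (streaming) stores bypass the cache hierarchy, which
    // pays off for large output buffers that will not be read back soon.
    // Like store_aligned(), this requires 16-byte alignment.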
    inline
    void store_nt(int *data) const
    {
        SHORTVEC_ASSERT_ALIGNED(data, 16);
        _mm_stream_si128(reinterpret_cast<__m128i *>(data +  0), val1);
        _mm_stream_si128(reinterpret_cast<__m128i *>(data +  4), val2);
        _mm_stream_si128(reinterpret_cast<__m128i *>(data +  8), val3);
        _mm_stream_si128(reinterpret_cast<__m128i *>(data + 12), val4);
    }

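    // gather()/scatter() address elements as ptr[offsets[i]]. With
    // SSE4.1 they use per-lane insert/extract instructions; the SSE2
    // fallback below moves scalars and reorders them with unpacks and
    // shuffles.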
#ifdef __SSE4_1__
    inline
    void gather(const int *ptr, const int *offsets)
    {
        val1 = _mm_insert_epi32(val1, ptr[offsets[ 0]], 0);
        val1 = _mm_insert_epi32(val1, ptr[offsets[ 1]], 1);
        val1 = _mm_insert_epi32(val1, ptr[offsets[ 2]], 2);
        val1 = _mm_insert_epi32(val1, ptr[offsets[ 3]], 3);

        val2 = _mm_insert_epi32(val2, ptr[offsets[ 4]], 0);
        val2 = _mm_insert_epi32(val2, ptr[offsets[ 5]], 1);
        val2 = _mm_insert_epi32(val2, ptr[offsets[ 6]], 2);
        val2 = _mm_insert_epi32(val2, ptr[offsets[ 7]], 3);

        val3 = _mm_insert_epi32(val3, ptr[offsets[ 8]], 0);
        val3 = _mm_insert_epi32(val3, ptr[offsets[ 9]], 1);
        val3 = _mm_insert_epi32(val3, ptr[offsets[10]], 2);
        val3 = _mm_insert_epi32(val3, ptr[offsets[11]], 3);

        val4 = _mm_insert_epi32(val4, ptr[offsets[12]], 0);
        val4 = _mm_insert_epi32(val4, ptr[offsets[13]], 1);
        val4 = _mm_insert_epi32(val4, ptr[offsets[14]], 2);
        val4 = _mm_insert_epi32(val4, ptr[offsets[15]], 3);
    }

    inline
    void scatter(int *ptr, const int *offsets) const
    {
        ptr[offsets[ 0]] = _mm_extract_epi32(val1, 0);
        ptr[offsets[ 1]] = _mm_extract_epi32(val1, 1);
        ptr[offsets[ 2]] = _mm_extract_epi32(val1, 2);
        ptr[offsets[ 3]] = _mm_extract_epi32(val1, 3);

        ptr[offsets[ 4]] = _mm_extract_epi32(val2, 0);
        ptr[offsets[ 5]] = _mm_extract_epi32(val2, 1);
        ptr[offsets[ 6]] = _mm_extract_epi32(val2, 2);
        ptr[offsets[ 7]] = _mm_extract_epi32(val2, 3);

        ptr[offsets[ 8]] = _mm_extract_epi32(val3, 0);
        ptr[offsets[ 9]] = _mm_extract_epi32(val3, 1);
        ptr[offsets[10]] = _mm_extract_epi32(val3, 2);
        ptr[offsets[11]] = _mm_extract_epi32(val3, 3);

        ptr[offsets[12]] = _mm_extract_epi32(val4, 0);
        ptr[offsets[13]] = _mm_extract_epi32(val4, 1);
        ptr[offsets[14]] = _mm_extract_epi32(val4, 2);
        ptr[offsets[15]] = _mm_extract_epi32(val4, 3);
    }
#else
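    // Each group of four scalars is placed in the low lane of its own
    // register and then interleaved: the first unpack pairs elements
    // 0/2 and 1/3, the second restores the full lane order 0,1,2,3.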
    inline
    void gather(const int *ptr, const int *offsets)
    {
        __m128i i2, i3, i4;
        val1 = _mm_cvtsi32_si128(ptr[offsets[0]]);
        i2   = _mm_cvtsi32_si128(ptr[offsets[1]]);
        i3   = _mm_cvtsi32_si128(ptr[offsets[2]]);
        i4   = _mm_cvtsi32_si128(ptr[offsets[3]]);
        val1 = _mm_unpacklo_epi32(val1, i3);
        i3   = _mm_unpacklo_epi32(i2  , i4);
        val1 = _mm_unpacklo_epi32(val1, i3);

        val2 = _mm_cvtsi32_si128(ptr[offsets[4]]);
        i2   = _mm_cvtsi32_si128(ptr[offsets[5]]);
        i3   = _mm_cvtsi32_si128(ptr[offsets[6]]);
        i4   = _mm_cvtsi32_si128(ptr[offsets[7]]);
        val2 = _mm_unpacklo_epi32(val2, i3);
        i3   = _mm_unpacklo_epi32(i2  , i4);
        val2 = _mm_unpacklo_epi32(val2, i3);

        val3 = _mm_cvtsi32_si128(ptr[offsets[ 8]]);
        i2   = _mm_cvtsi32_si128(ptr[offsets[ 9]]);
        i3   = _mm_cvtsi32_si128(ptr[offsets[10]]);
        i4   = _mm_cvtsi32_si128(ptr[offsets[11]]);
        val3 = _mm_unpacklo_epi32(val3, i3);
        i3   = _mm_unpacklo_epi32(i2  , i4);
        val3 = _mm_unpacklo_epi32(val3, i3);

        val4 = _mm_cvtsi32_si128(ptr[offsets[12]]);
        i2   = _mm_cvtsi32_si128(ptr[offsets[13]]);
        i3   = _mm_cvtsi32_si128(ptr[offsets[14]]);
        i4   = _mm_cvtsi32_si128(ptr[offsets[15]]);
        val4 = _mm_unpacklo_epi32(val4, i3);
        i3   = _mm_unpacklo_epi32(i2  , i4);
        val4 = _mm_unpacklo_epi32(val4, i3);
    }

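    // _mm_shuffle_epi32 rotates the lane of interest down to position 0,
    // where _mm_cvtsi128_si32 can extract it as a scalar.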
    inline
    void scatter(int *ptr, const int *offsets) const
    {
        ptr[offsets[ 0]] = _mm_cvtsi128_si32(val1);
        ptr[offsets[ 1]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val1, _MM_SHUFFLE(0,3,2,1)));
        ptr[offsets[ 2]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val1, _MM_SHUFFLE(1,0,3,2)));
        ptr[offsets[ 3]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val1, _MM_SHUFFLE(2,1,0,3)));

        ptr[offsets[ 4]] = _mm_cvtsi128_si32(val2);
        ptr[offsets[ 5]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val2, _MM_SHUFFLE(0,3,2,1)));
        ptr[offsets[ 6]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val2, _MM_SHUFFLE(1,0,3,2)));
        ptr[offsets[ 7]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val2, _MM_SHUFFLE(2,1,0,3)));

        ptr[offsets[ 8]] = _mm_cvtsi128_si32(val3);
        ptr[offsets[ 9]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val3, _MM_SHUFFLE(0,3,2,1)));
        ptr[offsets[10]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val3, _MM_SHUFFLE(1,0,3,2)));
        ptr[offsets[11]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val3, _MM_SHUFFLE(2,1,0,3)));

        ptr[offsets[12]] = _mm_cvtsi128_si32(val4);
        ptr[offsets[13]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val4, _MM_SHUFFLE(0,3,2,1)));
        ptr[offsets[14]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val4, _MM_SHUFFLE(1,0,3,2)));
        ptr[offsets[15]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val4, _MM_SHUFFLE(2,1,0,3)));
    }
#endif

private:
    __m128i val1;
    __m128i val2;
    __m128i val3;
    __m128i val4;
};

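// Convenience notation: "data << vec" stores all 16 lanes to the given
// (potentially unaligned) pointer.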
inline
void operator<<(int *data, const short_vec<int, 16>& vec)
{
    vec.store(data);
}

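// Proxy type returned by sqrt(short_vec<int, 16>): it lets expressions
// of the form "a / sqrt(b)" be resolved to a dedicated overload and
// evaluated with the fast approximate reciprocal square root (see
// operator/= below) instead of a full square root plus division.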
template<>
class sqrt_reference<int, 16>
{
public:
    template<typename OTHER_CARGO, int OTHER_ARITY>
    friend class short_vec;

    sqrt_reference(const short_vec<int, 16>& vec) :
        vec(vec)
    {}

private:
    short_vec<int, 16> vec;
};

#ifdef __ICC
#pragma warning pop
#endif

inline
short_vec<int, 16>::short_vec(const sqrt_reference<int, 16>& other) :
    val1(
        _mm_cvtps_epi32(
            _mm_sqrt_ps(_mm_cvtepi32_ps(other.vec.val1)))),
    val2(
        _mm_cvtps_epi32(
            _mm_sqrt_ps(_mm_cvtepi32_ps(other.vec.val2)))),
    val3(
        _mm_cvtps_epi32(
            _mm_sqrt_ps(_mm_cvtepi32_ps(other.vec.val3)))),
    val4(
        _mm_cvtps_epi32(
            _mm_sqrt_ps(_mm_cvtepi32_ps(other.vec.val4))))
{}

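// a / sqrt(b) is computed as a * rsqrt(b). _mm_rsqrt_ps is only an
// approximation (about 12 bits of relative accuracy), so the rounded
// integer result can differ slightly from an exact evaluation.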
inline
void short_vec<int, 16>::operator/=(const sqrt_reference<int, 16>& other)
{
    val1 = _mm_cvtps_epi32(
        _mm_mul_ps(_mm_cvtepi32_ps(val1),
                   _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val1))));
    val2 = _mm_cvtps_epi32(
        _mm_mul_ps(_mm_cvtepi32_ps(val2),
                   _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val2))));
    val3 = _mm_cvtps_epi32(
        _mm_mul_ps(_mm_cvtepi32_ps(val3),
                   _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val3))));
    val4 = _mm_cvtps_epi32(
        _mm_mul_ps(_mm_cvtepi32_ps(val4),
                   _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val4))));
}

inline
short_vec<int, 16> short_vec<int, 16>::operator/(const sqrt_reference<int, 16>& other) const
{
    return short_vec<int, 16>(
        _mm_cvtps_epi32(
            _mm_mul_ps(_mm_cvtepi32_ps(val1),
                       _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val1)))),
        _mm_cvtps_epi32(
            _mm_mul_ps(_mm_cvtepi32_ps(val2),
                       _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val2)))),
        _mm_cvtps_epi32(
            _mm_mul_ps(_mm_cvtepi32_ps(val3),
                       _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val3)))),
        _mm_cvtps_epi32(
            _mm_mul_ps(_mm_cvtepi32_ps(val4),
                       _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val4)))));
}

inline
sqrt_reference<int, 16> sqrt(const short_vec<int, 16>& vec)
{
    return sqrt_reference<int, 16>(vec);
}

template<typename _CharT, typename _Traits>
std::basic_ostream<_CharT, _Traits>&
operator<<(std::basic_ostream<_CharT, _Traits>& __os,
           const short_vec<int, 16>& vec)
{
    const int *data1 = reinterpret_cast<const int *>(&vec.val1);
    const int *data2 = reinterpret_cast<const int *>(&vec.val2);
    const int *data3 = reinterpret_cast<const int *>(&vec.val3);
    const int *data4 = reinterpret_cast<const int *>(&vec.val4);
    __os << "["
         << data1[0] << ", " << data1[1] << ", " << data1[2] << ", " << data1[3] << ", "
         << data2[0] << ", " << data2[1] << ", " << data2[2] << ", " << data2[3] << ", "
         << data3[0] << ", " << data3[1] << ", " << data3[2] << ", " << data3[3] << ", "
         << data4[0] << ", " << data4[1] << ", " << data4[2] << ", " << data4[3]
         << "]";
    return __os;
}

}

#endif

#endif