/**
 * Copyright 2016 Andreas Schäfer
 * Copyright 2015 Kurt Kanzenbach
 *
 * Distributed under the Boost Software License, Version 1.0. (See accompanying
 * file LICENSE or copy at http://www.boost.org/LICENSE_1_0.txt)
 */
8
#ifndef FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_INT_16_HPP
#define FLAT_ARRAY_DETAIL_SHORT_VEC_SSE_INT_16_HPP

#if (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE) ||    \
    (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE2) ||   \
    (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_SSE4_1) || \
    (LIBFLATARRAY_WIDEST_VECTOR_ISA == LIBFLATARRAY_AVX)

#include <emmintrin.h>
#include <libflatarray/detail/sqrt_reference.hpp>
#include <libflatarray/detail/short_vec_helpers.hpp>
#include <libflatarray/config.h>
#include <iostream>

#ifdef __SSE4_1__
#include <smmintrin.h>
#endif

#ifdef LIBFLATARRAY_WITH_CPP14
#include <initializer_list>
#endif
30
namespace LibFlatArray {

template<typename CARGO, int ARITY>
class short_vec;

template<typename CARGO, int ARITY>
class sqrt_reference;

#ifdef __ICC
// disabling this warning as implicit type conversion is exactly our goal here:
#pragma warning push
#pragma warning (disable: 2304)
#endif
44
45 template<>
46 class short_vec<int, 16>
47 {
48 public:
49 static const int ARITY = 16;
50
51 typedef short_vec_strategy::sse strategy;
52
53 template<typename _CharT, typename _Traits>
54 friend std::basic_ostream<_CharT, _Traits>& operator<<(
55 std::basic_ostream<_CharT, _Traits>& __os,
56 const short_vec<int, 16>& vec);
57
58 inline
short_vec(const int data=0)59 short_vec(const int data = 0) :
60 val1(_mm_set1_epi32(data)),
61 val2(_mm_set1_epi32(data)),
62 val3(_mm_set1_epi32(data)),
63 val4(_mm_set1_epi32(data))
64 {}
65
66 inline
short_vec(const int * data)67 short_vec(const int *data)
68 {
69 load(data);
70 }
71
72 inline
short_vec(const __m128i & val1,const __m128i & val2,const __m128i & val3,const __m128i & val4)73 short_vec(const __m128i& val1, const __m128i& val2,
74 const __m128i& val3, const __m128i& val4) :
75 val1(val1),
76 val2(val2),
77 val3(val3),
78 val4(val4)
79 {}
80
81 #ifdef LIBFLATARRAY_WITH_CPP14
82 inline
short_vec(const std::initializer_list<int> & il)83 short_vec(const std::initializer_list<int>& il)
84 {
85 const int *ptr = static_cast<const int *>(&(*il.begin()));
86 load(ptr);
87 }
88 #endif
89
90 inline
91 short_vec(const sqrt_reference<int, 16>& other);
92
93 inline
operator -=(const short_vec<int,16> & other)94 void operator-=(const short_vec<int, 16>& other)
95 {
96 val1 = _mm_sub_epi32(val1, other.val1);
97 val2 = _mm_sub_epi32(val2, other.val2);
98 val3 = _mm_sub_epi32(val3, other.val3);
99 val4 = _mm_sub_epi32(val4, other.val4);
100 }
101
102 inline
operator -(const short_vec<int,16> & other) const103 short_vec<int, 16> operator-(const short_vec<int, 16>& other) const
104 {
105 return short_vec<int, 16>(
106 _mm_sub_epi32(val1, other.val1),
107 _mm_sub_epi32(val2, other.val2),
108 _mm_sub_epi32(val3, other.val3),
109 _mm_sub_epi32(val4, other.val4));
110 }
111
112 inline
operator +=(const short_vec<int,16> & other)113 void operator+=(const short_vec<int, 16>& other)
114 {
115 val1 = _mm_add_epi32(val1, other.val1);
116 val2 = _mm_add_epi32(val2, other.val2);
117 val3 = _mm_add_epi32(val3, other.val3);
118 val4 = _mm_add_epi32(val4, other.val4);
119 }
120
121 inline
operator +(const short_vec<int,16> & other) const122 short_vec<int, 16> operator+(const short_vec<int, 16>& other) const
123 {
124 return short_vec<int, 16>(
125 _mm_add_epi32(val1, other.val1),
126 _mm_add_epi32(val2, other.val2),
127 _mm_add_epi32(val3, other.val3),
128 _mm_add_epi32(val4, other.val4));
129 }
130
131 #ifdef __SSE4_1__
132 inline
operator *=(const short_vec<int,16> & other)133 void operator*=(const short_vec<int, 16>& other)
134 {
135 val1 = _mm_mullo_epi32(val1, other.val1);
136 val2 = _mm_mullo_epi32(val2, other.val2);
137 val3 = _mm_mullo_epi32(val3, other.val3);
138 val4 = _mm_mullo_epi32(val4, other.val4);
139 }
140
141 inline
operator *(const short_vec<int,16> & other) const142 short_vec<int, 16> operator*(const short_vec<int, 16>& other) const
143 {
144 return short_vec<int, 16>(
145 _mm_mullo_epi32(val1, other.val1),
146 _mm_mullo_epi32(val2, other.val2),
147 _mm_mullo_epi32(val3, other.val3),
148 _mm_mullo_epi32(val4, other.val4));
149 }
150 #else
151 inline
operator *=(const short_vec<int,16> & other)152 void operator*=(const short_vec<int, 16>& other)
153 {
154 // see: https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
155 __m128i tmp1 = _mm_mul_epu32(val1, other.val1);
156 __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(val1, 4),
157 _mm_srli_si128(other.val1, 4));
158 val1 = _mm_unpacklo_epi32(
159 _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
160 _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
161
162 tmp1 = _mm_mul_epu32(val2, other.val2);
163 tmp2 = _mm_mul_epu32(_mm_srli_si128(val2, 4),
164 _mm_srli_si128(other.val2, 4));
165 val2 = _mm_unpacklo_epi32(
166 _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
167 _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
168
169 tmp1 = _mm_mul_epu32(val3, other.val3);
170 tmp2 = _mm_mul_epu32(_mm_srli_si128(val3, 4),
171 _mm_srli_si128(other.val3, 4));
172 val3 = _mm_unpacklo_epi32(
173 _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
174 _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
175
176 tmp1 = _mm_mul_epu32(val4, other.val4);
177 tmp2 = _mm_mul_epu32(_mm_srli_si128(val4, 4),
178 _mm_srli_si128(other.val4, 4));
179 val4 = _mm_unpacklo_epi32(
180 _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
181 _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
182 }
183
184 inline
operator *(const short_vec<int,16> & other) const185 short_vec<int, 16> operator*(const short_vec<int, 16>& other) const
186 {
187 // see: https://software.intel.com/en-us/forums/intel-c-compiler/topic/288768
188 __m128i tmp1 = _mm_mul_epu32(val1, other.val1);
189 __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(val1, 4),
190 _mm_srli_si128(other.val1, 4));
191 __m128i result1 = _mm_unpacklo_epi32(
192 _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
193 _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
194
195 tmp1 = _mm_mul_epu32(val2, other.val2);
196 tmp2 = _mm_mul_epu32(_mm_srli_si128(val2, 4),
197 _mm_srli_si128(other.val2, 4));
198 __m128i result2 = _mm_unpacklo_epi32(
199 _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
200 _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
201
202 tmp1 = _mm_mul_epu32(val3, other.val3);
203 tmp2 = _mm_mul_epu32(_mm_srli_si128(val3, 4),
204 _mm_srli_si128(other.val3, 4));
205 __m128i result3 = _mm_unpacklo_epi32(
206 _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
207 _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
208
209 tmp1 = _mm_mul_epu32(val4, other.val4);
210 tmp2 = _mm_mul_epu32(_mm_srli_si128(val4, 4),
211 _mm_srli_si128(other.val4, 4));
212 __m128i result4 = _mm_unpacklo_epi32(
213 _mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0,0,2,0)),
214 _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0,0,2,0)));
215
216 return short_vec<int, 16>(result1, result2, result3, result4);
217 }
218 #endif
219
220 inline
operator /=(const short_vec<int,16> & other)221 void operator/=(const short_vec<int, 16>& other)
222 {
223 val1 = _mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(val1),
224 _mm_cvtepi32_ps(other.val1)));
225 val2 = _mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(val2),
226 _mm_cvtepi32_ps(other.val2)));
227 val3 = _mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(val3),
228 _mm_cvtepi32_ps(other.val3)));
229 val4 = _mm_cvtps_epi32(_mm_div_ps(_mm_cvtepi32_ps(val4),
230 _mm_cvtepi32_ps(other.val4)));
231 }
232
233 inline
234 void operator/=(const sqrt_reference<int, 16>& other);
235
236 inline
operator /(const short_vec<int,16> & other) const237 short_vec<int, 16> operator/(const short_vec<int, 16>& other) const
238 {
239 return short_vec<int, 16>(
240 _mm_cvttps_epi32(_mm_div_ps(
241 _mm_cvtepi32_ps(val1),
242 _mm_cvtepi32_ps(other.val1))),
243 _mm_cvttps_epi32(_mm_div_ps(
244 _mm_cvtepi32_ps(val2),
245 _mm_cvtepi32_ps(other.val2))),
246 _mm_cvttps_epi32(_mm_div_ps(
247 _mm_cvtepi32_ps(val3),
248 _mm_cvtepi32_ps(other.val3))),
249 _mm_cvttps_epi32(_mm_div_ps(
250 _mm_cvtepi32_ps(val4),
251 _mm_cvtepi32_ps(other.val4))));
252 }
253
254 inline
255 short_vec<int, 16> operator/(const sqrt_reference<int, 16>& other) const;
256
257 inline
sqrt() const258 short_vec<int, 16> sqrt() const
259 {
260 return short_vec<int, 16>(
261 _mm_cvtps_epi32(
262 _mm_sqrt_ps(_mm_cvtepi32_ps(val1))),
263 _mm_cvtps_epi32(
264 _mm_sqrt_ps(_mm_cvtepi32_ps(val2))),
265 _mm_cvtps_epi32(
266 _mm_sqrt_ps(_mm_cvtepi32_ps(val3))),
267 _mm_cvtps_epi32(
268 _mm_sqrt_ps(_mm_cvtepi32_ps(val4))));
269 }
270
271 inline
load(const int * data)272 void load(const int *data)
273 {
274 val1 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + 0));
275 val2 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + 4));
276 val3 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + 8));
277 val4 = _mm_loadu_si128(reinterpret_cast<const __m128i *>(data + 12));
278 }
279
280 inline
load_aligned(const int * data)281 void load_aligned(const int *data)
282 {
283 SHORTVEC_ASSERT_ALIGNED(data, 16);
284 val1 = _mm_load_si128(reinterpret_cast<const __m128i *>(data + 0));
285 val2 = _mm_load_si128(reinterpret_cast<const __m128i *>(data + 4));
286 val3 = _mm_load_si128(reinterpret_cast<const __m128i *>(data + 8));
287 val4 = _mm_load_si128(reinterpret_cast<const __m128i *>(data + 12));
288 }
289
290 inline
store(int * data) const291 void store(int *data) const
292 {
293 _mm_storeu_si128(reinterpret_cast<__m128i *>(data + 0), val1);
294 _mm_storeu_si128(reinterpret_cast<__m128i *>(data + 4), val2);
295 _mm_storeu_si128(reinterpret_cast<__m128i *>(data + 8), val3);
296 _mm_storeu_si128(reinterpret_cast<__m128i *>(data + 12), val4);
297 }
298
299 inline
store_aligned(int * data) const300 void store_aligned(int *data) const
301 {
302 SHORTVEC_ASSERT_ALIGNED(data, 16);
303 _mm_store_si128(reinterpret_cast<__m128i *>(data + 0), val1);
304 _mm_store_si128(reinterpret_cast<__m128i *>(data + 4), val2);
305 _mm_store_si128(reinterpret_cast<__m128i *>(data + 8), val3);
306 _mm_store_si128(reinterpret_cast<__m128i *>(data + 12), val4);
307 }
308
309 inline
store_nt(int * data) const310 void store_nt(int *data) const
311 {
312 SHORTVEC_ASSERT_ALIGNED(data, 16);
313 _mm_stream_si128(reinterpret_cast<__m128i *>(data + 0), val1);
314 _mm_stream_si128(reinterpret_cast<__m128i *>(data + 4), val2);
315 _mm_stream_si128(reinterpret_cast<__m128i *>(data + 8), val3);
316 _mm_stream_si128(reinterpret_cast<__m128i *>(data + 12), val4);
317 }
318
319 #ifdef __SSE4_1__
320 inline
gather(const int * ptr,const int * offsets)321 void gather(const int *ptr, const int *offsets)
322 {
323 val1 = _mm_insert_epi32(val1, ptr[offsets[ 0]], 0);
324 val1 = _mm_insert_epi32(val1, ptr[offsets[ 1]], 1);
325 val1 = _mm_insert_epi32(val1, ptr[offsets[ 2]], 2);
326 val1 = _mm_insert_epi32(val1, ptr[offsets[ 3]], 3);
327
328 val2 = _mm_insert_epi32(val2, ptr[offsets[ 4]], 0);
329 val2 = _mm_insert_epi32(val2, ptr[offsets[ 5]], 1);
330 val2 = _mm_insert_epi32(val2, ptr[offsets[ 6]], 2);
331 val2 = _mm_insert_epi32(val2, ptr[offsets[ 7]], 3);
332
333 val3 = _mm_insert_epi32(val3, ptr[offsets[ 8]], 0);
334 val3 = _mm_insert_epi32(val3, ptr[offsets[ 9]], 1);
335 val3 = _mm_insert_epi32(val3, ptr[offsets[10]], 2);
336 val3 = _mm_insert_epi32(val3, ptr[offsets[11]], 3);
337
338 val4 = _mm_insert_epi32(val4, ptr[offsets[12]], 0);
339 val4 = _mm_insert_epi32(val4, ptr[offsets[13]], 1);
340 val4 = _mm_insert_epi32(val4, ptr[offsets[14]], 2);
341 val4 = _mm_insert_epi32(val4, ptr[offsets[15]], 3);
342 }
343
344 inline
scatter(int * ptr,const int * offsets) const345 void scatter(int *ptr, const int *offsets) const
346 {
347 ptr[offsets[ 0]] = _mm_extract_epi32(val1, 0);
348 ptr[offsets[ 1]] = _mm_extract_epi32(val1, 1);
349 ptr[offsets[ 2]] = _mm_extract_epi32(val1, 2);
350 ptr[offsets[ 3]] = _mm_extract_epi32(val1, 3);
351
352 ptr[offsets[ 4]] = _mm_extract_epi32(val2, 0);
353 ptr[offsets[ 5]] = _mm_extract_epi32(val2, 1);
354 ptr[offsets[ 6]] = _mm_extract_epi32(val2, 2);
355 ptr[offsets[ 7]] = _mm_extract_epi32(val2, 3);
356
357 ptr[offsets[ 8]] = _mm_extract_epi32(val3, 0);
358 ptr[offsets[ 9]] = _mm_extract_epi32(val3, 1);
359 ptr[offsets[10]] = _mm_extract_epi32(val3, 2);
360 ptr[offsets[11]] = _mm_extract_epi32(val3, 3);
361
362 ptr[offsets[12]] = _mm_extract_epi32(val4, 0);
363 ptr[offsets[13]] = _mm_extract_epi32(val4, 1);
364 ptr[offsets[14]] = _mm_extract_epi32(val4, 2);
365 ptr[offsets[15]] = _mm_extract_epi32(val4, 3);
366 }
367 #else
368 inline
gather(const int * ptr,const int * offsets)369 void gather(const int *ptr, const int *offsets)
370 {
371 __m128i i2, i3, i4;
372 val1 = _mm_cvtsi32_si128(ptr[offsets[0]]);
373 i2 = _mm_cvtsi32_si128(ptr[offsets[1]]);
374 i3 = _mm_cvtsi32_si128(ptr[offsets[2]]);
375 i4 = _mm_cvtsi32_si128(ptr[offsets[3]]);
376 val1 = _mm_unpacklo_epi32(val1, i3);
377 i3 = _mm_unpacklo_epi32(i2 , i4);
378 val1 = _mm_unpacklo_epi32(val1, i3);
379
380 val2 = _mm_cvtsi32_si128(ptr[offsets[4]]);
381 i2 = _mm_cvtsi32_si128(ptr[offsets[5]]);
382 i3 = _mm_cvtsi32_si128(ptr[offsets[6]]);
383 i4 = _mm_cvtsi32_si128(ptr[offsets[7]]);
384 val2 = _mm_unpacklo_epi32(val2, i3);
385 i3 = _mm_unpacklo_epi32(i2 , i4);
386 val2 = _mm_unpacklo_epi32(val2, i3);
387
388 val3 = _mm_cvtsi32_si128(ptr[offsets[ 8]]);
389 i2 = _mm_cvtsi32_si128(ptr[offsets[ 9]]);
390 i3 = _mm_cvtsi32_si128(ptr[offsets[10]]);
391 i4 = _mm_cvtsi32_si128(ptr[offsets[11]]);
392 val3 = _mm_unpacklo_epi32(val3, i3);
393 i3 = _mm_unpacklo_epi32(i2 , i4);
394 val3 = _mm_unpacklo_epi32(val3, i3);
395
396 val4 = _mm_cvtsi32_si128(ptr[offsets[12]]);
397 i2 = _mm_cvtsi32_si128(ptr[offsets[13]]);
398 i3 = _mm_cvtsi32_si128(ptr[offsets[14]]);
399 i4 = _mm_cvtsi32_si128(ptr[offsets[15]]);
400 val4 = _mm_unpacklo_epi32(val4, i3);
401 i3 = _mm_unpacklo_epi32(i2 , i4);
402 val4 = _mm_unpacklo_epi32(val4, i3);
403 }
404
405 inline
scatter(int * ptr,const int * offsets) const406 void scatter(int *ptr, const int *offsets) const
407 {
408 ptr[offsets[ 0]] = _mm_cvtsi128_si32(val1);
409 ptr[offsets[ 1]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val1, _MM_SHUFFLE(0,3,2,1)));
410 ptr[offsets[ 2]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val1, _MM_SHUFFLE(1,0,3,2)));
411 ptr[offsets[ 3]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val1, _MM_SHUFFLE(2,1,0,3)));
412
413 ptr[offsets[ 4]] = _mm_cvtsi128_si32(val2);
414 ptr[offsets[ 5]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val2, _MM_SHUFFLE(0,3,2,1)));
415 ptr[offsets[ 6]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val2, _MM_SHUFFLE(1,0,3,2)));
416 ptr[offsets[ 7]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val2, _MM_SHUFFLE(2,1,0,3)));
417
418 ptr[offsets[ 8]] = _mm_cvtsi128_si32(val3);
419 ptr[offsets[ 9]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val3, _MM_SHUFFLE(0,3,2,1)));
420 ptr[offsets[10]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val3, _MM_SHUFFLE(1,0,3,2)));
421 ptr[offsets[11]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val3, _MM_SHUFFLE(2,1,0,3)));
422
423 ptr[offsets[12]] = _mm_cvtsi128_si32(val4);
424 ptr[offsets[13]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val4, _MM_SHUFFLE(0,3,2,1)));
425 ptr[offsets[14]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val4, _MM_SHUFFLE(1,0,3,2)));
426 ptr[offsets[15]] = _mm_cvtsi128_si32(_mm_shuffle_epi32(val4, _MM_SHUFFLE(2,1,0,3)));
427 }
428 #endif
429
430 private:
431 __m128i val1;
432 __m128i val2;
433 __m128i val3;
434 __m128i val4;
435 };
436
437 inline
operator <<(int * data,const short_vec<int,16> & vec)438 void operator<<(int *data, const short_vec<int, 16>& vec)
439 {
440 vec.store(data);
441 }
442
443 template<>
444 class sqrt_reference<int, 16>
445 {
446 public:
447 template<typename OTHER_CARGO, int OTHER_ARITY>
448 friend class short_vec;
449
sqrt_reference(const short_vec<int,16> & vec)450 sqrt_reference(const short_vec<int, 16>& vec) :
451 vec(vec)
452 {}
453
454 private:
455 short_vec<int, 16> vec;
456 };
457
#ifdef __ICC
#pragma warning pop
#endif
461
462 inline
short_vec(const sqrt_reference<int,16> & other)463 short_vec<int, 16>::short_vec(const sqrt_reference<int, 16>& other) :
464 val1(
465 _mm_cvtps_epi32(
466 _mm_sqrt_ps(_mm_cvtepi32_ps(other.vec.val1)))),
467 val2(
468 _mm_cvtps_epi32(
469 _mm_sqrt_ps(_mm_cvtepi32_ps(other.vec.val2)))),
470 val3(
471 _mm_cvtps_epi32(
472 _mm_sqrt_ps(_mm_cvtepi32_ps(other.vec.val3)))),
473 val4(
474 _mm_cvtps_epi32(
475 _mm_sqrt_ps(_mm_cvtepi32_ps(other.vec.val4))))
476 {}
477
478 inline
operator /=(const sqrt_reference<int,16> & other)479 void short_vec<int, 16>::operator/=(const sqrt_reference<int, 16>& other)
480 {
481 val1 = _mm_cvtps_epi32(
482 _mm_mul_ps(_mm_cvtepi32_ps(val1),
483 _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val1))));
484 val2 = _mm_cvtps_epi32(
485 _mm_mul_ps(_mm_cvtepi32_ps(val2),
486 _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val2))));
487 val3 = _mm_cvtps_epi32(
488 _mm_mul_ps(_mm_cvtepi32_ps(val3),
489 _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val3))));
490 val4 = _mm_cvtps_epi32(
491 _mm_mul_ps(_mm_cvtepi32_ps(val4),
492 _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val4))));
493 }
494
495 inline
operator /(const sqrt_reference<int,16> & other) const496 short_vec<int, 16> short_vec<int, 16>::operator/(const sqrt_reference<int, 16>& other) const
497 {
498 return short_vec<int, 16>(
499 _mm_cvtps_epi32(
500 _mm_mul_ps(_mm_cvtepi32_ps(val1),
501 _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val1)))),
502 _mm_cvtps_epi32(
503 _mm_mul_ps(_mm_cvtepi32_ps(val2),
504 _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val2)))),
505 _mm_cvtps_epi32(
506 _mm_mul_ps(_mm_cvtepi32_ps(val3),
507 _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val3)))),
508 _mm_cvtps_epi32(
509 _mm_mul_ps(_mm_cvtepi32_ps(val4),
510 _mm_rsqrt_ps(_mm_cvtepi32_ps(other.vec.val4)))));
511 }
512
513 inline
sqrt(const short_vec<int,16> & vec)514 sqrt_reference<int, 16> sqrt(const short_vec<int, 16>& vec)
515 {
516 return sqrt_reference<int, 16>(vec);
517 }
518
519 template<typename _CharT, typename _Traits>
520 std::basic_ostream<_CharT, _Traits>&
operator <<(std::basic_ostream<_CharT,_Traits> & __os,const short_vec<int,16> & vec)521 operator<<(std::basic_ostream<_CharT, _Traits>& __os,
522 const short_vec<int, 16>& vec)
523 {
524 const int *data1 = reinterpret_cast<const int *>(&vec.val1);
525 const int *data2 = reinterpret_cast<const int *>(&vec.val2);
526 const int *data3 = reinterpret_cast<const int *>(&vec.val3);
527 const int *data4 = reinterpret_cast<const int *>(&vec.val4);
528 __os << "["
529 << data1[0] << ", " << data1[1] << ", " << data1[2] << ", " << data1[3] << ", "
530 << data2[0] << ", " << data2[1] << ", " << data2[2] << ", " << data2[3] << ", "
531 << data3[0] << ", " << data3[1] << ", " << data3[2] << ", " << data3[3] << ", "
532 << data4[0] << ", " << data4[1] << ", " << data4[2] << ", " << data4[3]
533 << "]";
534 return __os;
535 }

}

#endif

#endif