/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_CORE_I_SHIFT_R_V_H
#define LIBSIMDPP_SIMDPP_CORE_I_SHIFT_R_V_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/null/math.h>
#include <simdpp/detail/insn/i_shift.h>
#include <simdpp/detail/shuffle/shuffle_mask.h>
#include <simdpp/core/i_neg.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
// emulates 8-bit variable shift using 16-bit variable shift
template<class U8> SIMDPP_INL
U8 v_emul_shift_r_u8_using_v16(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    U16 a16; a16 = a;
    U16 c16; c16 = count;

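    // Each 16-bit lane holds two 8-bit elements. The low elements are masked
    // so that the 16-bit logical shift pulls in zeros for them; the high
    // elements are shifted in place, and the bits they shift into the low
    // byte are discarded afterwards.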
    U16 select_mask = make_uint(0x00ff);
    U16 a_lo = bit_and(a16, select_mask);
    U16 a_hi = a16;
    U16 c_lo = bit_and(c16, select_mask);
    U16 c_hi = shift_r<8>(c16);
    a_lo = shift_r(a_lo, c_lo);
    a_hi = shift_r(a_hi, c_hi);
    a_hi = bit_andnot(a_hi, select_mask);

    a16 = bit_or(a_lo, a_hi);
    return (U8) a16;
}

// emulates 8-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U8> SIMDPP_INL
U8 v_emul_shift_r_u8_using_mul(const U8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;

    // Variable shift is implemented by reusing the shifter of the 16-bit
    // unsigned multiplier. The result is obtained by computing 1 << (8-countN)
    // for each element of count, multiplying the corresponding element of a
    // by that number and selecting the high half of each 32-bit product.
    U8 mulshift_mask = make_uint(0x80, 0x40, 0x20, 0x10,
                                 0x08, 0x04, 0x02, 0x01,
                                 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);
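    // The mask is a lookup table: byte countN selects 0x80 >> countN, which
    // the shift_l<1> below turns into 1 << (8-countN). For countN == 0 the
    // multiplier becomes 0x100, which returns the element unchanged; counts
    // of 8 or more select zero, which correctly produces a zero result.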
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);
    U16 a16; a16 = a;
    U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
    U16 select_mask = make_uint(0x00ff);

    // Move the element values to the high byte of the 16-bit elements and the
    // shift values to the low 9 bits. The 9-th bit is needed because shifting
    // by 0 requires multiplying the element values by 0x100.
    // The results will have the high byte clear, which helps composing the
    // results back into a single vector.
    a16_lo = shift_l<8>(a16);
    mulshift_lo = bit_and(mulshift, select_mask);
    mulshift_lo = shift_l<1>(mulshift_lo);
    a16_hi = bit_andnot(a16, select_mask);
    mulshift_hi = shift_l<1>(shift_r<8>(mulshift));

    a16_lo = mul_hi(a16_lo, mulshift_lo);
    a16_hi = mul_hi(a16_hi, mulshift_hi);

    a16_hi = shift_l<8>(a16_hi);
    a16 = bit_or(a16_lo, a16_hi);
    return (U8) a16;
}

static SIMDPP_INL
uint8<16> i_shift_r_v(const uint8<16>& a, const uint8<16>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_r_u8_using_v16(a, count);
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_r_u8_using_mul(a, count);
#elif SIMDPP_USE_NEON
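    // NEON has no variable right shift; vshlq shifts left by a per-element
    // signed count, so a right shift is a left shift by the negated count.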
    int8<16> qcount = neg((int8<16>)count);
    return vshlq_u8(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sr(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_srl_b((v16i8)a.native(), (v16i8)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8<32> i_shift_r_v(const uint8<32>& a, const uint8<32>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_r_u8_using_v16(a, count);
#else
    return v_emul_shift_r_u8_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_shift_r_v(const uint8<64>& a, const uint8<64>& count)
{
    return v_emul_shift_r_u8_using_v16(a, count);
}
#endif

// -----------------------------------------------------------------------------

// emulates 8-bit variable shift using 16-bit variable shift
template<class I8, class U8> SIMDPP_INL
I8 v_emul_shift_r_i8_using_v16(const I8& a, const U8& count)
{
    using I16 = typename same_width<I8>::i16;
    using U16 = typename same_width<I8>::u16;

    U16 a16; a16 = a;
    U16 c16; c16 = count;

    U16 select_mask = make_uint(0x00ff);
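    // The low elements are moved into the high byte of each 16-bit lane so
    // that the arithmetic shift extends their own sign bit; the results are
    // then moved back down with a logical shift.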
    U16 a_lo = shift_l<8>(a16);
    U16 a_hi = a16;
    U16 c_lo = bit_and(c16, select_mask);
    U16 c_hi = shift_r<8>(c16);
    a_lo = shift_r((I16)a_lo, c_lo);
    a_hi = shift_r((I16)a_hi, c_hi);

    a_lo = shift_r<8>(a_lo);
    a_hi = bit_andnot(a_hi, select_mask);
    a16 = bit_or(a_lo, a_hi);

    return (I8) a16;
}

template<class I8, class U8> SIMDPP_INL
I8 v_emul_shift_r_i8_using_mul(const I8& a, const U8& count)
{
    using U16 = typename same_width<U8>::u16;
    using I16 = typename same_width<U8>::i16;

    // Variable shift is implemented by reusing the shifter of the 16-bit
    // signed multiplier. The result is obtained by computing 1 << (8-countN)
    // for each element of count, multiplying the corresponding element of a
    // by that number and selecting the high half of each 32-bit product.
    U8 mulshift_mask = make_uint(0x80, 0x40, 0x20, 0x10,
                                 0x08, 0x04, 0x02, 0x01,
                                 0x00, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);
    U16 mulshift = (U16) permute_bytes16(mulshift_mask, count);
    U16 a16; a16 = a;
    U16 a16_lo, a16_hi, mulshift_lo, mulshift_hi;
    U16 select_mask = make_uint(0x00ff);

    // Move the element values to the high byte of the 16-bit elements and the
    // shift values to the low 9 bits. The 9-th bit is needed because shifting
    // by 0 requires multiplying the element values by 0x100.
    // Note that the results may have a nonzero high byte because the
    // multiplication is signed.
    a16_lo = shift_l<8>(a16);
    mulshift_lo = bit_and(mulshift, select_mask);
    mulshift_lo = shift_l<1>(mulshift_lo);
    a16_hi = bit_andnot(a16, select_mask);
    mulshift_hi = shift_l<1>(shift_r<8>(mulshift));

    a16_lo = mul_hi((I16)a16_lo, (I16)mulshift_lo);
    a16_hi = mul_hi((I16)a16_hi, (I16)mulshift_hi);

    a16_hi = shift_l<8>(a16_hi);
    a16_lo = bit_and(a16_lo, select_mask);
    a16 = bit_or(a16_lo, a16_hi);
    return (I8) a16;
}

static SIMDPP_INL
int8<16> i_shift_r_v(const int8<16>& a, const uint8<16>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_r_i8_using_v16(a, count);
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_r_i8_using_mul(a, count);
#elif SIMDPP_USE_NEON
    int8<16> qcount = neg((int8<16>)count);
    return vshlq_s8(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sra(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return __msa_sra_b(a.native(), (v16i8) count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int8<32> i_shift_r_v(const int8<32>& a, const uint8<32>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return v_emul_shift_r_i8_using_v16(a, count);
#else
    return v_emul_shift_r_i8_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
int8<64> i_shift_r_v(const int8<64>& a, const uint8<64>& count)
{
    return v_emul_shift_r_i8_using_v16(a, count);
}
#endif

// -----------------------------------------------------------------------------

// emulates 16-bit variable shift using permute_bytes16 and 16-bit multiplication
template<class U16> SIMDPP_INL
U16 v_emul_shift_r_u16_using_mul(const U16& a, const U16& count)
{
    using U8 = typename same_width<U16>::u8;
    using M16 = typename U16::mask_vector_type;
    // Variable shift is implemented by reusing the shifter of the 16-bit
    // unsigned multiplier. The result is obtained by computing 1 << (16-countN)
    // for each element of count, multiplying the corresponding element of a
    // by that number and selecting the high half of each 32-bit product.
    // Note that the highest shift obtainable this way is 15, and a shift by 0
    // would need a multiplier of 0x10000, which does not fit into 16 bits;
    // both cases are worked around with extra instructions.
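    // is_same marks elements that must be returned unchanged (count == 0);
    // is_zero marks elements whose result must be zero (count > 15).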
    M16 is_same = cmp_eq(count, 0);
    M16 is_zero = cmp_gt(count, 15);

    U8 mulshift_mask = make_uint(0x00, 0x80, 0x40, 0x20,
                                 0x10, 0x08, 0x04, 0x02,
                                 0x01, 0x00, 0x00, 0x00,
                                 0x00, 0x00, 0x00, 0x00);

    // permute_bytes16 permutes 8-bit elements, whereas a 16-bit permutation
    // would be optimal here, so the selector needs to be constructed specially
    // for an 8-bit permutation.
    // The 4-th bit of the count is toggled in the low byte of each element so
    // that the low byte selects zero from the mulshift mask when the shift
    // count is less than 8, and the correct table entry when it is 8 or more.
    U16 qcount = bit_or(count, shift_l<8>(count));
    qcount = bit_xor(qcount, 0x0008);

    U16 mulshift = (U16) permute_bytes16(mulshift_mask, (U8) qcount);
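    // For example, countN == 3 selects 0x20 into the high byte and zero into
    // the low byte, giving mulshift = 0x2000 == 1 << (16-3); mul_hi then
    // computes (a * 0x2000) >> 16 == a >> 3.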
    U16 res = mul_hi(a, mulshift);
    res = blend(a, res, is_same);
    res = bit_andnot(res, is_zero);
    return res;
}

static SIMDPP_INL
uint16<8> i_shift_r_v(const uint16<8>& a, const uint16<8>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm_srlv_epi16(a.native(), count.native());
#elif SIMDPP_USE_SSSE3
    return v_emul_shift_r_u16_using_mul(a, count);
#elif SIMDPP_USE_NEON
    int16<8> qcount = neg((int16<8>)count);
    return vshlq_u16(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sr(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_srl_h((v8i16)a.native(), (v8i16)count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_shift_r_v(const uint16<16>& a, const uint16<16>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm256_srlv_epi16(a.native(), count.native());
#else
    return v_emul_shift_r_u16_using_mul(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint16<32> i_shift_r_v(const uint16<32>& a, const uint16<32>& count)
{
    return _mm512_srlv_epi16(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int16<8> i_shift_r_v(const int16<8>& a, const uint16<8>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm_srav_epi16(a.native(), count.native());
#elif SIMDPP_USE_AVX512BW
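    // Without AVX512VL the 16-bit arithmetic variable shift is only available
    // at 512-bit width, so widen the operands, shift, and narrow the result.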
    __m512i a512 = _mm512_castsi128_si512(a.native());
    __m512i count512 = _mm512_castsi128_si512(count.native());
    return _mm512_castsi512_si128(_mm512_srav_epi16(a512, count512));
#elif SIMDPP_USE_NEON
    int16<8> qcount = neg((int16<8>)count);
    return vshlq_s16(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sra(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return __msa_sra_h(a.native(), (v8i16) count.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int16<16> i_shift_r_v(const int16<16>& a, const uint16<16>& count)
{
#if SIMDPP_USE_AVX512BW && SIMDPP_USE_AVX512VL
    return _mm256_srav_epi16(a.native(), count.native());
#elif SIMDPP_USE_AVX512BW
    __m512i a512 = _mm512_castsi256_si512(a.native());
    __m512i count512 = _mm512_castsi256_si512(count.native());
    return _mm512_castsi512_si256(_mm512_srav_epi16(a512, count512));
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, count);
#endif
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL int16<32> i_shift_r_v(const int16<32>& a, const uint16<32>& count)
{
    return _mm512_srav_epi16(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
uint32<4> i_shift_r_v(const uint32<4>& a, const uint32<4>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX2
    return _mm_srlv_epi32(a.native(), count.native());
#elif SIMDPP_USE_SSE2
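    // SSE2 has no per-element variable shift. _mm_srl_epi32 shifts every
    // element by the count held in the low 64 bits of its second operand, so
    // four count vectors are prepared, each placing a different element's
    // count there with the neighboring element cleared, and the four shifted
    // results are blended together.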
    uint32<4> count0 = count;
#if SIMDPP_USE_SSE4_1
    uint32<4> zero = make_zero();
    count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc);
#else
    uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0);
    count0 = bit_and(count0, mask);
#endif
    uint32<4> count1 = _mm_srli_epi64(count.native(), 32);
    uint32<4> count2 = _mm_srli_si128(count0.native(), 8);
    uint32<4> count3 = _mm_srli_si128(count.native(), 12);

    __m128i a0 = _mm_srl_epi32(a.native(), count0.native());
    __m128i a1 = _mm_srl_epi32(a.native(), count1.native());
    __m128i a2 = _mm_srl_epi32(a.native(), count2.native());
    __m128i a3 = _mm_srl_epi32(a.native(), count3.native());
#if SIMDPP_USE_SSE4_1
    a0 = _mm_blend_epi16(a0, a1, 0x0c);
    a2 = _mm_blend_epi16(a2, a3, 0xc0);
    a0 = _mm_blend_epi16(a0, a2, 0xf0);
#else
    __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
                               _mm_castsi128_ps(a1),
                               SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1));
    __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
                               _mm_castsi128_ps(a3),
                               SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3));
    f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2));
    a0 = _mm_castps_si128(f0);
#endif
    return a0;
#elif SIMDPP_USE_NEON
    int32<4> qcount = neg((int32<4>)count);
    return vshlq_u32(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sr(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_srl_w((v4i32)a.native(), (v4i32)count.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_shift_r_v(const uint32<8>& a, const uint32<8>& count)
{
    return _mm256_srlv_epi32(a.native(), count.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_shift_r_v(const uint32<16>& a, const uint32<16>& count)
{
    return _mm512_srlv_epi32(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

static SIMDPP_INL
int32<4> i_shift_r_v(const int32<4>& a, const uint32<4>& count)
{
#if SIMDPP_USE_NULL
    return detail::null::shift_r_v(a, count);
#elif SIMDPP_USE_AVX2
    return _mm_srav_epi32(a.native(), count.native());
#elif SIMDPP_USE_SSE2
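    // Same per-element count extraction as in the unsigned uint32<4> variant
    // above; only the shift itself differs, using arithmetic _mm_sra_epi32.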
    uint32<4> count0 = count;
#if SIMDPP_USE_SSE4_1
    uint32<4> zero = make_zero();
    count0 = _mm_blend_epi16(count0.native(), zero.native(), 0xcc);
#else
    uint32<4> mask = make_uint(0xffffffff, 0, 0xffffffff, 0);
    count0 = bit_and(count0, mask);
#endif
    uint32<4> count1 = _mm_srli_epi64(count.native(), 32);
    uint32<4> count2 = _mm_srli_si128(count0.native(), 8);
    uint32<4> count3 = _mm_srli_si128(count.native(), 12);

    __m128i a0 = _mm_sra_epi32(a.native(), count0.native());
    __m128i a1 = _mm_sra_epi32(a.native(), count1.native());
    __m128i a2 = _mm_sra_epi32(a.native(), count2.native());
    __m128i a3 = _mm_sra_epi32(a.native(), count3.native());
#if SIMDPP_USE_SSE4_1
    a0 = _mm_blend_epi16(a0, a1, 0x0c);
    a2 = _mm_blend_epi16(a2, a3, 0xc0);
    a0 = _mm_blend_epi16(a0, a2, 0xf0);
#else
    __m128 f0 = _mm_shuffle_ps(_mm_castsi128_ps(a0),
                               _mm_castsi128_ps(a1),
                               SIMDPP_SHUFFLE_MASK_4x4(0, 0, 1, 1));
    __m128 f1 = _mm_shuffle_ps(_mm_castsi128_ps(a2),
                               _mm_castsi128_ps(a3),
                               SIMDPP_SHUFFLE_MASK_4x4(2, 2, 3, 3));
    f0 = _mm_shuffle_ps(f0, f1, SIMDPP_SHUFFLE_MASK_4x4(0, 2, 0, 2));
    a0 = _mm_castps_si128(f0);
#endif
    return a0;
#elif SIMDPP_USE_NEON
    int32<4> qcount = neg((int32<4>)count);
    return vshlq_s32(a.native(), qcount.native());
#elif SIMDPP_USE_ALTIVEC
    return vec_sra(a.native(), count.native());
#elif SIMDPP_USE_MSA
    return __msa_sra_w(a.native(), (v4i32)count.native());
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
int32<8> i_shift_r_v(const int32<8>& a, const uint32<8>& count)
{
    return _mm256_srav_epi32(a.native(), count.native());
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
int32<16> i_shift_r_v(const int32<16>& a, const uint32<16>& count)
{
    return _mm512_srav_epi32(a.native(), count.native());
}
#endif

// -----------------------------------------------------------------------------

template<class V, class U> SIMDPP_INL
V i_shift_r_v(const V& a, const U& b)
{
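    // Vectors wider than the native width are processed by applying
    // i_shift_r_v to each native-width sub-vector.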
    SIMDPP_VEC_ARRAY_IMPL2(V, i_shift_r_v, a, b);
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif