/***************************************************************************
* Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
* Martin Renou                                                             *
* Copyright (c) QuantStack                                                 *
*                                                                          *
* Distributed under the terms of the BSD 3-Clause License.                 *
*                                                                          *
* The full license is in the file LICENSE, distributed with this software. *
****************************************************************************/

#ifndef XSIMD_AVX512_INT16_HPP
#define XSIMD_AVX512_INT16_HPP

#include "xsimd_avx512_bool.hpp"
#include "xsimd_avx512_int_base.hpp"

namespace xsimd
{

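    // Fallback helper: when AVX512BW is unavailable, forwards a binary
    // operation to the AVX2 int16 implementation, applied to the low and
    // high 256-bit halves of the 512-bit operands.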
#define XSIMD_APPLY_AVX2_FUNCTION_INT16(func, avx_lhs, avx_rhs) \
    XSIMD_APPLY_AVX2_FUNCTION(16, func, avx_lhs, avx_rhs)

    /***************************
     * batch_bool<int16_t, 32> *
     ***************************/

    template <>
    struct simd_batch_traits<batch_bool<int16_t, 32>>
    {
        using value_type = int16_t;
        static constexpr std::size_t size = 32;
        using batch_type = batch<int16_t, 32>;
        static constexpr std::size_t align = 64;
    };

    template <>
    struct simd_batch_traits<batch_bool<uint16_t, 32>>
    {
        using value_type = uint16_t;
        static constexpr std::size_t size = 32;
        using batch_type = batch<uint16_t, 32>;
        static constexpr std::size_t align = 64;
    };

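    // Masks over 32 16-bit lanes require the AVX512BW instruction set
    // (__mmask32); without it, the fallback batch_bool implementation is used.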
#if defined(XSIMD_AVX512BW_AVAILABLE)

    template <>
    class batch_bool<int16_t, 32> :
        public batch_bool_avx512<__mmask32, batch_bool<int16_t, 32>>
    {
    public:

        using base_class = batch_bool_avx512<__mmask32, batch_bool<int16_t, 32>>;
        using base_class::base_class;
    };

    template <>
    class batch_bool<uint16_t, 32> :
        public batch_bool_avx512<__mmask32, batch_bool<uint16_t, 32>>
    {
    public:

        using base_class = batch_bool_avx512<__mmask32, batch_bool<uint16_t, 32>>;
        using base_class::base_class;
    };

    namespace detail
    {
        template <>
        struct batch_bool_kernel<int16_t, 32>
            : batch_bool_kernel_avx512<int16_t, 32>
        {
        };

        template <>
        struct batch_bool_kernel<uint16_t, 32>
            : batch_bool_kernel_avx512<uint16_t, 32>
        {
        };
    }

#else

    template <>
    class batch_bool<int16_t, 32> : public avx512_fallback_batch_bool<int16_t, 32>
    {
    public:

        using base_class = avx512_fallback_batch_bool<int16_t, 32>;
        using base_class::base_class;
    };

    template <>
    class batch_bool<uint16_t, 32> : public avx512_fallback_batch_bool<uint16_t, 32>
    {
    public:

        using base_class = avx512_fallback_batch_bool<uint16_t, 32>;
        using base_class::base_class;
    };

    namespace detail
    {
        template <>
        struct batch_bool_kernel<int16_t, 32>
            : avx512_fallback_batch_bool_kernel<int16_t, 32>
        {
        };

        template <>
        struct batch_bool_kernel<uint16_t, 32>
            : avx512_fallback_batch_bool_kernel<uint16_t, 32>
        {
        };
    }

#endif

    /**********************
     * batch<int16_t, 32> *
     **********************/

    template <>
    struct simd_batch_traits<batch<int16_t, 32>>
    {
        using value_type = int16_t;
        static constexpr std::size_t size = 32;
        using batch_bool_type = batch_bool<int16_t, 32>;
        static constexpr std::size_t align = 64;
        using storage_type = __m512i;
    };

    template <>
    struct simd_batch_traits<batch<uint16_t, 32>>
    {
        using value_type = uint16_t;
        static constexpr std::size_t size = 32;
        using batch_bool_type = batch_bool<uint16_t, 32>;
        static constexpr std::size_t align = 64;
        using storage_type = __m512i;
    };

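    // Storage is a single __m512i in both configurations: the register itself
    // only needs AVX512F; AVX512BW is required only for 16-bit lane operations.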
    template <>
    class batch<int16_t, 32> : public avx512_int_batch<int16_t, 32>
    {
    public:

        using base_class = avx512_int_batch;
        using base_class::base_class;
        using base_class::load_aligned;
        using base_class::load_unaligned;
        using base_class::store_aligned;
        using base_class::store_unaligned;

        batch() = default;

        explicit batch(const char* src)
            : batch(reinterpret_cast<const int16_t*>(src))
        {
        }

        batch(const char* src, aligned_mode)
            : batch(reinterpret_cast<const int16_t*>(src), aligned_mode{})
        {
        }

        batch(const char* src, unaligned_mode)
            : batch(reinterpret_cast<const int16_t*>(src), unaligned_mode{})
        {
        }

        XSIMD_DECLARE_LOAD_STORE_INT16(int16_t, 32)
        XSIMD_DECLARE_LOAD_STORE_LONG(int16_t, 32)
    };
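
    // Illustrative usage (not part of the library): splat-construct,
    // load from aligned memory, and add element-wise.
    //
    //     alignas(64) int16_t data[32] = {};
    //     xsimd::batch<int16_t, 32> b(data, xsimd::aligned_mode());
    //     b += xsimd::batch<int16_t, 32>(int16_t(1));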

    template <>
    class batch<uint16_t, 32> : public avx512_int_batch<uint16_t, 32>
    {
    public:

        using base_class = avx512_int_batch;
        using base_class::base_class;
        using base_class::load_aligned;
        using base_class::load_unaligned;
        using base_class::store_aligned;
        using base_class::store_unaligned;

        XSIMD_DECLARE_LOAD_STORE_INT16(uint16_t, 32)
        XSIMD_DECLARE_LOAD_STORE_LONG(uint16_t, 32)
    };

    batch<int16_t, 32> operator<<(const batch<int16_t, 32>& lhs, int32_t rhs);
    batch<int16_t, 32> operator>>(const batch<int16_t, 32>& lhs, int32_t rhs);
    batch<int16_t, 32> operator<<(const batch<int16_t, 32>& lhs, const batch<int16_t, 32>& rhs);
    batch<int16_t, 32> operator>>(const batch<int16_t, 32>& lhs, const batch<int16_t, 32>& rhs);
    batch<uint16_t, 32> operator<<(const batch<uint16_t, 32>& lhs, int32_t rhs);
    batch<uint16_t, 32> operator>>(const batch<uint16_t, 32>& lhs, int32_t rhs);
    batch<uint16_t, 32> operator<<(const batch<uint16_t, 32>& lhs, const batch<int16_t, 32>& rhs);
    batch<uint16_t, 32> operator>>(const batch<uint16_t, 32>& lhs, const batch<int16_t, 32>& rhs);

    /*************************************
     * batch<int16_t, 32> implementation *
     *************************************/

    namespace detail
    {
        template <class T>
        struct avx512_int16_batch_kernel
            : avx512_int_kernel_base<batch<T, 32>>
        {
            using batch_type = batch<T, 32>;
            using value_type = T;
            using batch_bool_type = batch_bool<T, 32>;

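            // Common pattern below: use the AVX512BW word-granularity
            // intrinsic when available, otherwise split the 512-bit batch
            // into two 256-bit halves and use the AVX2 implementation.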
            static batch_type neg(const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_sub_epi16(_mm512_setzero_si512(), rhs);
            #else
                XSIMD_SPLIT_AVX512(rhs);
                __m256i res_low = _mm256_sub_epi16(_mm256_setzero_si256(), rhs_low);
                __m256i res_high = _mm256_sub_epi16(_mm256_setzero_si256(), rhs_high);
                XSIMD_RETURN_MERGED_AVX(res_low, res_high);
            #endif
            }

            static batch_type add(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_add_epi16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(add, lhs, rhs);
            #endif
            }

            static batch_type sub(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_sub_epi16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(sub, lhs, rhs);
            #endif
            }

            static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_adds_epi16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(sadd, lhs, rhs);
            #endif
            }

            static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_subs_epi16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(ssub, lhs, rhs);
            #endif
            }

            static batch_type mul(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_mullo_epi16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(mul, lhs, rhs);
            #endif
            }

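            // There is no AVX512 intrinsic for integer division or modulo:
            // div is delegated to the 256-bit halves and mod is unrolled
            // lane by lane with the scalar % operator.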
            static batch_type div(const batch_type& lhs, const batch_type& rhs)
            {
                XSIMD_APPLY_AVX2_FUNCTION_INT16(div, lhs, rhs);
            }

            static batch_type mod(const batch_type& lhs, const batch_type& rhs)
            {
                XSIMD_MACRO_UNROLL_BINARY(%);
            }

            static batch_type bitwise_and(const batch_type& lhs, const batch_type& rhs)
            {
                return _mm512_and_si512(lhs, rhs);
            }

            static batch_type bitwise_or(const batch_type& lhs, const batch_type& rhs)
            {
                return _mm512_or_si512(lhs, rhs);
            }

            static batch_type bitwise_xor(const batch_type& lhs, const batch_type& rhs)
            {
                return _mm512_xor_si512(lhs, rhs);
            }

            static batch_type bitwise_not(const batch_type& rhs)
            {
                return _mm512_xor_si512(rhs, _mm512_set1_epi16(-1));
            }

            static batch_type bitwise_andnot(const batch_type& lhs, const batch_type& rhs)
            {
                return _mm512_andnot_si512(lhs, rhs);
            }

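            // No fused multiply-add exists for integers; these are plain
            // compositions of mul and add/sub, provided for API uniformity.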
            static batch_type fma(const batch_type& x, const batch_type& y, const batch_type& z)
            {
                return x * y + z;
            }

            static batch_type fms(const batch_type& x, const batch_type& y, const batch_type& z)
            {
                return x * y - z;
            }

            static batch_type fnma(const batch_type& x, const batch_type& y, const batch_type& z)
            {
                return -x * y + z;
            }

            static batch_type fnms(const batch_type& x, const batch_type& y, const batch_type& z)
            {
                return -x * y - z;
            }

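            // Horizontal add: fold the two 256-bit halves together, then
            // reuse the 16-lane reduction.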
            static value_type hadd(const batch_type& rhs)
            {
                XSIMD_SPLIT_AVX512(rhs);
                auto tmp = batch<value_type, 16>(rhs_low) + batch<value_type, 16>(rhs_high);
                return xsimd::hadd(tmp);
            }

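            // With AVX512BW, select is a single mask blend; the _MSC_VER
            // guard routes MSVC to the fallback path, presumably as a
            // compiler workaround. In the fallback, the mask is widened to
            // full 16-bit lanes and blended per byte on each 256-bit half.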
            static batch_type select(const batch_bool_type& cond, const batch_type& a, const batch_type& b)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE) && !defined(_MSC_VER)
                auto res = _mm512_mask_blend_epi16((__mmask32)cond, (__m512i)b, (__m512i)a);
                return batch_type(res);
            #else
                __m512i mcond = _mm512_maskz_broadcastw_epi16((__mmask32)cond, _mm_set1_epi32(~0));
                XSIMD_SPLIT_AVX512(mcond);
                XSIMD_SPLIT_AVX512(a);
                XSIMD_SPLIT_AVX512(b);

                auto res_lo = _mm256_blendv_epi8(b_low, a_low, mcond_low);
                auto res_hi = _mm256_blendv_epi8(b_high, a_high, mcond_high);

                XSIMD_RETURN_MERGED_AVX(res_lo, res_hi);
            #endif
            }

            static batch_type zip_lo(const batch_type& lhs, const batch_type& rhs)
            {
                return _mm512_unpacklo_epi16(lhs, rhs);
            }

            static batch_type zip_hi(const batch_type& lhs, const batch_type& rhs)
            {
                return _mm512_unpackhi_epi16(lhs, rhs);
            }

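            // Returns 32 consecutive elements of the concatenation
            // (v_lhs, v_rhs), starting at index num. With AVX512BW this maps
            // to _mm512_alignr_epi8; the byte offset is 2 * num since each
            // lane is two bytes wide.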
            static batch_type extract_pair(const batch_type& v_lhs, const batch_type& v_rhs, const int num)
            {
#if defined(XSIMD_AVX512BW_AVAILABLE)
                const batch_type lhs = v_rhs;
                const batch_type rhs = v_lhs;
                const int n = 2 * num;
                switch (n)
                {
                    case 0: return rhs;
                    XSIMD_REPEAT_64_v2(_mm512_alignr_epi8);
                    default: break;
                }
                return batch_type(T(0));
#else
                batch_type b_concatenate;
                const int n = num;
                for (int i = 0; i < (32 - n); ++i)
                {
                    b_concatenate[i] = v_lhs[i + n];
                    if (i < n)
                    {
                        b_concatenate[32 - 1 - i] = v_rhs[n - 1 - i];
                    }
                }
                return b_concatenate;
#endif
            }
        };

        template <>
        struct batch_kernel<int16_t, 32>
            : public avx512_int16_batch_kernel<int16_t>
        {
            static batch_type abs(const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_abs_epi16(rhs);
            #else
                XSIMD_SPLIT_AVX512(rhs);
                __m256i res_low = _mm256_abs_epi16(rhs_low);
                __m256i res_high = _mm256_abs_epi16(rhs_high);
                XSIMD_RETURN_MERGED_AVX(res_low, res_high);
            #endif
            }

            static batch_type min(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_min_epi16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(min, lhs, rhs);
            #endif
            }

            static batch_type max(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_max_epi16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(max, lhs, rhs);
            #endif
            }

            static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_cmpeq_epi16_mask(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(eq, lhs, rhs);
            #endif
            }

            static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_cmpneq_epi16_mask(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(neq, lhs, rhs);
            #endif
            }

            static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_cmplt_epi16_mask(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(lt, lhs, rhs);
            #endif
            }

            static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_cmple_epi16_mask(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(lte, lhs, rhs);
            #endif
            }
        };

        template <>
        struct batch_kernel<uint16_t, 32>
            : public avx512_int16_batch_kernel<uint16_t>
        {
            static batch_type abs(const batch_type& rhs)
            {
                return rhs;
            }

            static batch_type min(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_min_epu16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(min, lhs, rhs);
            #endif
            }

            static batch_type max(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_max_epu16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(max, lhs, rhs);
            #endif
            }

            static batch_bool_type eq(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_cmpeq_epu16_mask(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(eq, lhs, rhs);
            #endif
            }

            static batch_bool_type neq(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_cmpneq_epu16_mask(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(neq, lhs, rhs);
            #endif
            }

            static batch_bool_type lt(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_cmplt_epu16_mask(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(lt, lhs, rhs);
            #endif
            }

            static batch_bool_type lte(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_cmple_epu16_mask(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(lte, lhs, rhs);
            #endif
            }

            static batch_type sadd(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_adds_epu16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(sadd, lhs, rhs);
            #endif
            }

            static batch_type ssub(const batch_type& lhs, const batch_type& rhs)
            {
            #if defined(XSIMD_AVX512BW_AVAILABLE)
                return _mm512_subs_epu16(lhs, rhs);
            #else
                XSIMD_APPLY_AVX2_FUNCTION_INT16(ssub, lhs, rhs);
            #endif
            }
        };
    }

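    // Without AVX512BW there are no 16-bit shift intrinsics. Left and logical
    // right immediate shifts are emulated with 32-bit shifts followed by a
    // mask clearing the bits dragged in from the neighboring 16-bit lane;
    // arithmetic right shifts and variable shifts fall back to a scalar loop
    // (avx512_detail::shift_impl).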
    inline batch<int16_t, 32> operator<<(const batch<int16_t, 32>& lhs, int32_t rhs)
    {
#if defined(XSIMD_AVX512BW_AVAILABLE)
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
        return _mm512_sllv_epi16(lhs, _mm512_set1_epi16(rhs));
#else
        return _mm512_slli_epi16(lhs, rhs);
#endif
#else
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
        __m512i tmp = _mm512_sllv_epi32(lhs, _mm512_set1_epi32(rhs));
#else
        __m512i tmp = _mm512_slli_epi32(lhs, rhs);
#endif
        return _mm512_and_si512(_mm512_set1_epi16(0xFFFF << rhs), tmp);
#endif
    }

    inline batch<int16_t, 32> operator>>(const batch<int16_t, 32>& lhs, int32_t rhs)
    {
#if defined(XSIMD_AVX512BW_AVAILABLE)
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
        return _mm512_srav_epi16(lhs, _mm512_set1_epi16(rhs));
#else
        return _mm512_srai_epi16(lhs, rhs);
#endif
#else
        return avx512_detail::shift_impl([](int16_t val, int32_t s) { return val >> s; }, lhs, rhs);
#endif
    }

    inline batch<int16_t, 32> operator<<(const batch<int16_t, 32>& lhs, const batch<int16_t, 32>& rhs)
    {
#if defined(XSIMD_AVX512BW_AVAILABLE)
        return _mm512_sllv_epi16(lhs, rhs);
#else
        return avx512_detail::shift_impl([](int16_t val, int16_t s) { return val << s; }, lhs, rhs);
#endif
    }

    inline batch<int16_t, 32> operator>>(const batch<int16_t, 32>& lhs, const batch<int16_t, 32>& rhs)
    {
#if defined(XSIMD_AVX512BW_AVAILABLE)
        return _mm512_srav_epi16(lhs, rhs);
#else
        return avx512_detail::shift_impl([](int16_t val, int16_t s) { return val >> s; }, lhs, rhs);
#endif
    }

    XSIMD_DEFINE_LOAD_STORE_INT16(int16_t, 32, 64)
    XSIMD_DEFINE_LOAD_STORE_LONG(int16_t, 32, 64)

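    // For uint16_t, right shifts are logical (srl*) rather than the
    // arithmetic shifts (sra*) used for int16_t above.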
    inline batch<uint16_t, 32> operator<<(const batch<uint16_t, 32>& lhs, int32_t rhs)
    {
#if defined(XSIMD_AVX512BW_AVAILABLE)
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
        return _mm512_sllv_epi16(lhs, _mm512_set1_epi16(rhs));
#else
        return _mm512_slli_epi16(lhs, rhs);
#endif
#else
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
        __m512i tmp = _mm512_sllv_epi32(lhs, _mm512_set1_epi32(rhs));
#else
        __m512i tmp = _mm512_slli_epi32(lhs, rhs);
#endif
        return _mm512_and_si512(_mm512_set1_epi16(0xFFFF << rhs), tmp);
#endif
    }

    inline batch<uint16_t, 32> operator>>(const batch<uint16_t, 32>& lhs, int32_t rhs)
    {
#if defined(XSIMD_AVX512BW_AVAILABLE)
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
        return _mm512_srlv_epi16(lhs, _mm512_set1_epi16(rhs));
#else
        return _mm512_srli_epi16(lhs, rhs);
#endif
#else
#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
        __m512i tmp = _mm512_srlv_epi32(lhs, _mm512_set1_epi32(rhs));
#else
        __m512i tmp = _mm512_srli_epi32(lhs, rhs);
#endif
        return _mm512_and_si512(_mm512_set1_epi16(0xFFFF >> rhs), tmp);
#endif
    }

    inline batch<uint16_t, 32> operator<<(const batch<uint16_t, 32>& lhs, const batch<int16_t, 32>& rhs)
    {
#if defined(XSIMD_AVX512BW_AVAILABLE)
        return _mm512_sllv_epi16(lhs, rhs);
#else
        return avx512_detail::shift_impl([](uint16_t val, int16_t s) { return val << s; }, lhs, rhs);
#endif
    }

    inline batch<uint16_t, 32> operator>>(const batch<uint16_t, 32>& lhs, const batch<int16_t, 32>& rhs)
    {
#if defined(XSIMD_AVX512BW_AVAILABLE)
        return _mm512_srlv_epi16(lhs, rhs);
#else
        return avx512_detail::shift_impl([](uint16_t val, int16_t s) { return val >> s; }, lhs, rhs);
#endif
    }

    XSIMD_DEFINE_LOAD_STORE_INT16(uint16_t, 32, 64)
    XSIMD_DEFINE_LOAD_STORE_LONG(uint16_t, 32, 64)

#undef XSIMD_APPLY_AVX2_FUNCTION_INT16
}

#endif