/*  Copyright (C) 2013-2014  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE_ZBYTES16_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE_ZBYTES16_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/detail/not_implemented.h>
#include <simdpp/core/permute_bytes16.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
#if _MSC_VER
#pragma warning(push)
#pragma warning(disable: 4800) // forcing value to bool 'true' or 'false'
#endif

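// Selects bytes within each 16-byte block of a according to mask: the low
// four bits of each mask byte index the source byte, and a set bit 7 zeroes
// the result byte, matching the SSSE3 PSHUFB semantics (see the
// SIMDPP_USE_NULL fallback below for the reference behavior).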
static SIMDPP_INL
uint8x16 i_permute_zbytes16(const uint8x16& a, const uint8x16& mask)
{
#if SIMDPP_USE_NULL
    uint8x16 r;

    for (unsigned i = 0; i < 16; i++) {
        unsigned j = mask.el(i) & 0x0f;
        bool zero = mask.el(i) & 0x80;
        r.el(i) = zero ? 0 : a.el(j);
    }
    return r;
#elif SIMDPP_USE_SSSE3 || SIMDPP_USE_NEON
    return permute_bytes16(a, mask);
#elif SIMDPP_USE_ALTIVEC
    int8x16 a0 = a;
    int8x16 zero_mask = mask;
    zero_mask = shift_r<7>(zero_mask); // arithmetic shift broadcasts the sign bit: 0xff where mask bit 7 is set
    a0 = i_permute_bytes16(a0, mask);
    a0 = bit_andnot(a0, zero_mask);    // zero the bytes flagged by zero_mask
    return a0;
#elif SIMDPP_USE_MSA
    // MSA VSHF zeroes a destination byte whenever bit 6 or bit 7 of the
    // control byte is set, which already provides the required zeroing.
    return (v16u8) __msa_vshf_b((v16i8) mask.native(),
                                (v16i8) a.native(),
                                (v16i8) a.native());
#else
    return SIMDPP_NOT_IMPLEMENTED2(a, mask);
#endif
}

#if _MSC_VER
#pragma warning(pop)
#endif

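// On AVX2 and AVX-512BW, VPSHUFB shuffles independently within each 128-bit
// lane and zeroes bytes whose mask bit 7 is set, so a single instruction
// implements this operation for the wider vectors.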
#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8x32 i_permute_zbytes16(const uint8x32& a, const uint8x32& mask)
{
    return _mm256_shuffle_epi8(a.native(), mask.native());
}
#endif

#if SIMDPP_USE_AVX512BW
SIMDPP_INL uint8<64> i_permute_zbytes16(const uint8<64>& a, const uint8<64>& mask)
{
    return _mm512_shuffle_epi8(a.native(), mask.native());
}
#endif

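// The overloads below forward to the byte-level implementations: vectors
// longer than the native width are processed per-element by
// SIMDPP_VEC_ARRAY_IMPL2, and wider element types are reinterpreted as
// uint8 vectors of the same total width.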
template<unsigned N> SIMDPP_INL
uint8<N> i_permute_zbytes16(const uint8<N>& a, const uint8<N>& mask)
{
    SIMDPP_VEC_ARRAY_IMPL2(uint8<N>, i_permute_zbytes16, a, mask);
}
template<unsigned N> SIMDPP_INL
uint16<N> i_permute_zbytes16(const uint16<N>& a, const uint16<N>& mask)
{
    return (uint16<N>) i_permute_zbytes16(uint8<N*2>(a), uint8<N*2>(mask));
}
template<unsigned N> SIMDPP_INL
uint32<N> i_permute_zbytes16(const uint32<N>& a, const uint32<N>& mask)
{
    return (uint32<N>) i_permute_zbytes16(uint8<N*4>(a), uint8<N*4>(mask));
}
template<unsigned N> SIMDPP_INL
uint64<N> i_permute_zbytes16(const uint64<N>& a, const uint64<N>& mask)
{
    return (uint64<N>) i_permute_zbytes16(uint8<N*8>(a), uint8<N*8>(mask));
}
template<unsigned N> SIMDPP_INL
float32<N> i_permute_zbytes16(const float32<N>& a, const uint32<N>& mask)
{
    return float32<N>(i_permute_zbytes16(uint32<N>(a), mask));
}
template<unsigned N> SIMDPP_INL
float64<N> i_permute_zbytes16(const float64<N>& a, const uint64<N>& mask)
{
    return float64<N>(i_permute_zbytes16(uint64<N>(a), mask));
}


} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif
