1 /* Copyright (C) 2013-2014 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE_ZBYTES16_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_PERMUTE_ZBYTES16_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/detail/not_implemented.h>
17 #include <simdpp/core/permute_bytes16.h>
18 #include <simdpp/detail/vector_array_macros.h>
19
20 namespace simdpp {
21 namespace SIMDPP_ARCH_NAMESPACE {
22 namespace detail {
23 namespace insn {
24
25 #if _MSC_VER
26 #pragma warning(push)
27 #pragma warning(disable: 4800)
28 #endif
29
30 static SIMDPP_INL
i_permute_zbytes16(const uint8x16 & a,const uint8x16 & mask)31 uint8x16 i_permute_zbytes16(const uint8x16& a, const uint8x16& mask)
32 {
33 #if SIMDPP_USE_NULL
34 uint8x16 r;
35
36 for (unsigned i = 0; i < 16; i++) {
37 unsigned j = mask.el(i) & 0x0f;
38 bool zero = mask.el(i) & 0x80;
39 r.el(i) = zero ? 0 : a.el(j);
40 }
41 return r;
42 #elif SIMDPP_USE_SSSE3 || SIMDPP_USE_NEON
43 return permute_bytes16(a, mask);
44 #elif SIMDPP_USE_ALTIVEC
45 int8x16 a0 = a;
46 int8x16 zero_mask = mask;
47 zero_mask = shift_r<7>(zero_mask); // shift in the sign bit
48 a0 = i_permute_bytes16(a0, mask);
49 a0 = bit_andnot(a0, zero_mask);
50 return a0;
51 #elif SIMDPP_USE_MSA
52 return (v16u8) __msa_vshf_b((v16i8) mask.native(),
53 (v16i8) a.native(),
54 (v16i8) a.native());
55 #else
56 return SIMDPP_NOT_IMPLEMENTED2(a, mask);
57 #endif
58 }
59
60 #if _MSC_VER
61 #pragma warning(pop)
62 #endif
63
#if SIMDPP_USE_AVX2
// 32-byte overload, compiled only when AVX2 is enabled. Passes the native
// values of both operands straight to _mm256_shuffle_epi8.
static SIMDPP_INL
uint8x32 i_permute_zbytes16(const uint8x32& a, const uint8x32& mask)
{
    uint8x32 shuffled = _mm256_shuffle_epi8(a.native(), mask.native());
    return shuffled;
}
#endif
71
#if SIMDPP_USE_AVX512BW
// 64-byte overload, compiled only when AVX512BW is enabled. Passes the native
// values of both operands straight to _mm512_shuffle_epi8.
SIMDPP_INL uint8<64> i_permute_zbytes16(const uint8<64>& a, const uint8<64>& mask)
{
    uint8<64> shuffled = _mm512_shuffle_epi8(a.native(), mask.native());
    return shuffled;
}
#endif
78
79 template<unsigned N> SIMDPP_INL
i_permute_zbytes16(const uint8<N> & a,const uint8<N> & mask)80 uint8<N> i_permute_zbytes16(const uint8<N>& a, const uint8<N>& mask)
81 {
82 SIMDPP_VEC_ARRAY_IMPL2(uint8<N>, i_permute_zbytes16, a, mask);
83 }
84 template<unsigned N> SIMDPP_INL
i_permute_zbytes16(const uint16<N> & a,const uint16<N> & mask)85 uint16<N> i_permute_zbytes16(const uint16<N>& a, const uint16<N>& mask)
86 {
87 return (uint16<N>) i_permute_zbytes16(uint8<N*2>(a), uint8<N*2>(mask));
88 }
89 template<unsigned N> SIMDPP_INL
i_permute_zbytes16(const uint32<N> & a,const uint32<N> & mask)90 uint32<N> i_permute_zbytes16(const uint32<N>& a, const uint32<N>& mask)
91 {
92 return (uint32<N>) i_permute_zbytes16(uint8<N*4>(a), uint8<N*4>(mask));
93 }
94 template<unsigned N> SIMDPP_INL
i_permute_zbytes16(const uint64<N> & a,const uint64<N> & mask)95 uint64<N> i_permute_zbytes16(const uint64<N>& a, const uint64<N>& mask)
96 {
97 return (uint64<N>) i_permute_zbytes16(uint8<N*8>(a), uint8<N*8>(mask));
98 }
99 template<unsigned N> SIMDPP_INL
i_permute_zbytes16(const float32<N> & a,const uint32<N> & mask)100 float32<N> i_permute_zbytes16(const float32<N>& a, const uint32<N>& mask)
101 {
102 return float32<N>(i_permute_zbytes16(uint32<N>(a), mask));
103 }
104 template<unsigned N> SIMDPP_INL
i_permute_zbytes16(const float64<N> & a,const uint64<N> & mask)105 float64<N> i_permute_zbytes16(const float64<N>& a, const uint64<N>& mask)
106 {
107 return float64<N>(i_permute_zbytes16(uint64<N>(a), mask));
108 }
109
110
111 } // namespace insn
112 } // namespace detail
113 } // namespace SIMDPP_ARCH_NAMESPACE
114 } // namespace simdpp
115
116 #endif
117
118