1 /* Copyright (C) 2011-2017 Povilas Kanapickas <povilas@radix.lt>
2
3 Distributed under the Boost Software License, Version 1.0.
4 (See accompanying file LICENSE_1_0.txt or copy at
5 http://www.boost.org/LICENSE_1_0.txt)
6 */
7
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_BITS_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_BITS_H
10
11 #ifndef LIBSIMDPP_SIMD_H
12 #error "This file must be included through simd.h"
13 #endif
14
15 #include <simdpp/types.h>
16 #include <simdpp/core/bit_and.h>
17 #include <simdpp/core/bit_or.h>
18 #include <simdpp/core/extract.h>
19 #include <simdpp/core/i_shift_l.h>
20 #include <simdpp/core/i_sub.h>
21 #include <simdpp/core/make_uint.h>
22 #include <simdpp/core/move_l.h>
23
24 namespace simdpp {
25 namespace SIMDPP_ARCH_NAMESPACE {
26 namespace detail {
27 namespace insn {
28
i_extract_bits_any(const uint8<16> & ca)29 SIMDPP_INL uint16_t i_extract_bits_any(const uint8<16>& ca)
30 {
31 uint8<16> a = ca;
32 #if SIMDPP_USE_NULL
33 uint16_t r = 0;
34 for (unsigned i = 0; i < a.length; i++) {
35 uint8_t x = ca.el(i);
36 x = x & 1;
37 r = (r >> 1) | (uint16_t(x) << 15);
38 }
39 return r;
40 #elif SIMDPP_USE_SSE2
41 // Note that i_extract_bits depends on the exact implementation of this
42 // function.
43 return _mm_movemask_epi8(a.native());
44 #elif SIMDPP_USE_NEON
45 uint8x16 mask = make_uint(0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80);
46
47 a = bit_and(a, mask);
48 uint16<8> a16 = vpaddlq_u8(a.native());
49 uint32<4> a32 = vpaddlq_u16(a16.native());
50 uint8<16> a8 = vreinterpretq_u8_u64(vpaddlq_u32(a32.native()));
51 uint8x8_t r = vzip_u8(vget_low_u8(a8.native()), vget_high_u8(a8.native())).val[0];
52 return vget_lane_u16(vreinterpret_u16_u8(r), 0);
53 #elif SIMDPP_USE_ALTIVEC
54 uint8x16 mask = make_uint(0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80);
55 a = bit_and(a, mask);
56 uint32<4> zero = make_zero();
57 uint32x4 s = vec_sum4s(a.native(), zero.native());
58 uint32x4 shifts = make_uint(0, 0, 8, 8);
59 s = (__vector uint32_t) vec_sl(s.native(), shifts.native());
60 s = (int32x4)vec_sums((__vector int32_t)s.native(),
61 (__vector int32_t)zero.native());
62 #if SIMDPP_BIG_ENDIAN
63 return extract<7>(uint16x8(s));
64 #else
65 return extract<6>(uint16x8(s));
66 #endif
67 #elif SIMDPP_USE_MSA
68 // Note: the implementation of extract_bits depends of the exact behavior
69 // of this function
70 uint8x16 mask = make_uint(0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80);
71
72 a = bit_and(a, mask);
73 uint16<8> a16 = __msa_hadd_u_h(a.native(), a.native());
74 uint32<4> a32 = __msa_hadd_u_w(a16.native(), a16.native());
75 a = (v16u8) __msa_hadd_u_d(a32.native(), a32.native());
76 a = bit_or(a, move16_l<7>(a));
77 return extract<0>((uint16<8>)a);
78 #endif
79 }
80
i_extract_bits_any(const uint8<32> & ca)81 SIMDPP_INL uint32_t i_extract_bits_any(const uint8<32>& ca)
82 {
83 uint8<32> a = ca;
84 #if SIMDPP_USE_AVX2
85 return _mm256_movemask_epi8(a.native());
86 #else
87 uint8<16> lo, hi;
88 split(a, lo, hi);
89 return i_extract_bits_any(lo) | (i_extract_bits_any(hi) << 16);
90 #endif
91 }
92
93 template<unsigned id> SIMDPP_INL
i_extract_bits(const uint8<16> & ca)94 uint16_t i_extract_bits(const uint8<16>& ca)
95 {
96 uint8<16> a = ca;
97 #if SIMDPP_USE_NULL
98 uint16_t r = 0;
99 for (unsigned i = 0; i < a.length; i++) {
100 uint8_t x = ca.el(i);
101 x = (x >> id) & 1;
102 r = (r >> 1) | (uint16_t(x) << 15);
103 }
104 return r;
105 #elif SIMDPP_USE_SSE2
106 a = shift_l<7-id>((uint16x8) a);
107 return i_extract_bits_any(a);
108 #elif SIMDPP_USE_NEON
109 int8x16 shift_mask = make_int(0-int(id), 1-int(id), 2-int(id), 3-int(id),
110 4-int(id), 5-int(id), 6-int(id), 7-int(id));
111
112 a = vshlq_u8(a.native(), shift_mask.native());
113 return i_extract_bits_any(a);
114 #elif SIMDPP_USE_ALTIVEC
115 uint8x16 rot_mask = make_int(0-int(id), 1-int(id), 2-int(id), 3-int(id),
116 4-int(id), 5-int(id), 6-int(id), 7-int(id));
117 a = vec_rl(a.native(), rot_mask.native());
118 return i_extract_bits_any(a);
119 #elif SIMDPP_USE_MSA
120 int8x16 shifts = make_int(0-int(id), 1-int(id), 2-int(id), 3-int(id),
121 4-int(id), 5-int(id), 6-int(id), 7-int(id));
122 uint8<16> a_l = (v16u8) __msa_sll_b((v16i8) a.native(), shifts.native());
123 shifts = sub((int8<16>) make_zero(), shifts);
124 uint8<16> a_r = (v16u8) __msa_srl_b((v16i8) a.native(), shifts.native());
125 a = bit_or(a_l, a_r);
126 return i_extract_bits_any(a);
127 #endif
128 }
129
130 template<unsigned id> SIMDPP_INL
i_extract_bits(const uint8<32> & ca)131 uint32_t i_extract_bits(const uint8<32>& ca)
132 {
133 uint8<32> a = ca;
134 #if SIMDPP_USE_AVX2
135 a = shift_l<7-id>((uint16<16>) a);
136 return i_extract_bits_any(a);
137 #else
138 uint8<16> lo, hi;
139 split(a, lo, hi);
140 return i_extract_bits<id>(lo) | (i_extract_bits<id>(hi) << 16);
141 #endif
142 }
143
144 } // namespace insn
145 } // namespace detail
146 } // namespace SIMDPP_ARCH_NAMESPACE
147 } // namespace simdpp
148
149 #endif
150
151
152