1 /*  Copyright (C) 2011-2017  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_BITS_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_EXTRACT_BITS_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/core/bit_and.h>
17 #include <simdpp/core/bit_or.h>
18 #include <simdpp/core/extract.h>
19 #include <simdpp/core/i_shift_l.h>
20 #include <simdpp/core/i_sub.h>
21 #include <simdpp/core/make_uint.h>
22 #include <simdpp/core/move_l.h>
23 
24 namespace simdpp {
25 namespace SIMDPP_ARCH_NAMESPACE {
26 namespace detail {
27 namespace insn {
28 
i_extract_bits_any(const uint8<16> & ca)29 SIMDPP_INL uint16_t i_extract_bits_any(const uint8<16>& ca)
30 {
31     uint8<16> a = ca;
32 #if SIMDPP_USE_NULL
33     uint16_t r = 0;
34     for (unsigned i = 0; i < a.length; i++) {
35         uint8_t x = ca.el(i);
36         x = x & 1;
37         r = (r >> 1) | (uint16_t(x) << 15);
38     }
39     return r;
40 #elif SIMDPP_USE_SSE2
41     // Note that i_extract_bits depends on the exact implementation of this
42     // function.
43     return _mm_movemask_epi8(a.native());
44 #elif SIMDPP_USE_NEON
45     uint8x16 mask = make_uint(0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80);
46 
47     a = bit_and(a, mask);
48     uint16<8> a16 = vpaddlq_u8(a.native());
49     uint32<4> a32 = vpaddlq_u16(a16.native());
50     uint8<16> a8 = vreinterpretq_u8_u64(vpaddlq_u32(a32.native()));
51     uint8x8_t r = vzip_u8(vget_low_u8(a8.native()), vget_high_u8(a8.native())).val[0];
52     return vget_lane_u16(vreinterpret_u16_u8(r), 0);
53 #elif SIMDPP_USE_ALTIVEC
54     uint8x16 mask = make_uint(0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80);
55     a = bit_and(a, mask);
56     uint32<4> zero = make_zero();
57     uint32x4 s = vec_sum4s(a.native(), zero.native());
58     uint32x4 shifts = make_uint(0, 0, 8, 8);
59     s = (__vector uint32_t) vec_sl(s.native(), shifts.native());
60     s = (int32x4)vec_sums((__vector int32_t)s.native(),
61                           (__vector int32_t)zero.native());
62 #if SIMDPP_BIG_ENDIAN
63     return extract<7>(uint16x8(s));
64 #else
65     return extract<6>(uint16x8(s));
66 #endif
67 #elif SIMDPP_USE_MSA
68     // Note: the implementation of extract_bits depends of the exact behavior
69     // of this function
70     uint8x16 mask = make_uint(0x01,0x02,0x04,0x08,0x10,0x20,0x40,0x80);
71 
72     a = bit_and(a, mask);
73     uint16<8> a16 = __msa_hadd_u_h(a.native(), a.native());
74     uint32<4> a32 = __msa_hadd_u_w(a16.native(), a16.native());
75     a = (v16u8) __msa_hadd_u_d(a32.native(), a32.native());
76     a = bit_or(a, move16_l<7>(a));
77     return extract<0>((uint16<8>)a);
78 #endif
79 }
80 
i_extract_bits_any(const uint8<32> & ca)81 SIMDPP_INL uint32_t i_extract_bits_any(const uint8<32>& ca)
82 {
83     uint8<32> a = ca;
84 #if SIMDPP_USE_AVX2
85     return _mm256_movemask_epi8(a.native());
86 #else
87     uint8<16> lo, hi;
88     split(a, lo, hi);
89     return i_extract_bits_any(lo) | (i_extract_bits_any(hi) << 16);
90 #endif
91 }
92 
93 template<unsigned id> SIMDPP_INL
i_extract_bits(const uint8<16> & ca)94 uint16_t i_extract_bits(const uint8<16>& ca)
95 {
96     uint8<16> a = ca;
97 #if SIMDPP_USE_NULL
98     uint16_t r = 0;
99     for (unsigned i = 0; i < a.length; i++) {
100         uint8_t x = ca.el(i);
101         x = (x >> id) & 1;
102         r = (r >> 1) | (uint16_t(x) << 15);
103     }
104     return r;
105 #elif SIMDPP_USE_SSE2
106     a = shift_l<7-id>((uint16x8) a);
107     return i_extract_bits_any(a);
108 #elif SIMDPP_USE_NEON
109     int8x16 shift_mask = make_int(0-int(id), 1-int(id), 2-int(id), 3-int(id),
110                                   4-int(id), 5-int(id), 6-int(id), 7-int(id));
111 
112     a = vshlq_u8(a.native(), shift_mask.native());
113     return i_extract_bits_any(a);
114 #elif SIMDPP_USE_ALTIVEC
115     uint8x16 rot_mask = make_int(0-int(id), 1-int(id), 2-int(id), 3-int(id),
116                                  4-int(id), 5-int(id), 6-int(id), 7-int(id));
117     a = vec_rl(a.native(), rot_mask.native());
118     return i_extract_bits_any(a);
119 #elif SIMDPP_USE_MSA
120     int8x16 shifts = make_int(0-int(id), 1-int(id), 2-int(id), 3-int(id),
121                               4-int(id), 5-int(id), 6-int(id), 7-int(id));
122     uint8<16> a_l = (v16u8) __msa_sll_b((v16i8) a.native(), shifts.native());
123     shifts = sub((int8<16>) make_zero(), shifts);
124     uint8<16> a_r = (v16u8) __msa_srl_b((v16i8) a.native(), shifts.native());
125     a = bit_or(a_l, a_r);
126     return i_extract_bits_any(a);
127 #endif
128 }
129 
130 template<unsigned id> SIMDPP_INL
i_extract_bits(const uint8<32> & ca)131 uint32_t i_extract_bits(const uint8<32>& ca)
132 {
133     uint8<32> a = ca;
134 #if SIMDPP_USE_AVX2
135     a = shift_l<7-id>((uint16<16>) a);
136     return i_extract_bits_any(a);
137 #else
138     uint8<16> lo, hi;
139     split(a, lo, hi);
140     return i_extract_bits<id>(lo) | (i_extract_bits<id>(hi) << 16);
141 #endif
142 }
143 
144 } // namespace insn
145 } // namespace detail
146 } // namespace SIMDPP_ARCH_NAMESPACE
147 } // namespace simdpp
148 
149 #endif
150 
151 
152