1 /*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_POPCNT_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_POPCNT_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/core/bit_and.h>
17 #include <simdpp/core/extract.h>
18 #include <simdpp/core/i_add.h>
19 #include <simdpp/core/i_shift_r.h>
20 #include <simdpp/core/i_sub.h>
21 #include <simdpp/core/i_mul.h>
22 #include <simdpp/core/insert.h>
23 #include <simdpp/detail/null/bitwise.h>
24 #include <simdpp/detail/width.h>
25 #include <simdpp/detail/vector_array_macros.h>
26 
27 namespace simdpp {
28 namespace SIMDPP_ARCH_NAMESPACE {
29 namespace detail {
30 namespace insn {
31 
32 template<class V> SIMDPP_INL
v_emul_popcnt_u8(const V & a)33 V v_emul_popcnt_u8(const V& a)
34 {
35     // We're using 16-bit ops because on SSE/AVX no 8-bit shift is available
36     // There's no difference on other architectures
37     using w_b16 = typename same_width<V>::u16;
38 
39     w_b16 p = (w_b16)a;
40     w_b16 m55 = splat(0x5555);
41     w_b16 m33 = splat(0x3333);
42     w_b16 m0f = splat(0x0f0f);
43 
44     p = sub(p, bit_and(shift_r<1>(p), m55));
45     p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
46     p = bit_and(add(p, shift_r<4>(p)), m0f);
47     return (V) p;
48 }
49 
50 static SIMDPP_INL
i_popcnt(const uint8<16> & a)51 uint8<16> i_popcnt(const uint8<16>& a)
52 {
53 #if SIMDPP_USE_NULL
54     uint8<16> r;
55     for (unsigned i = 0; i < a.length; i++) {
56         r.el(i) = detail::null::el_popcnt8(a.el(i));
57     }
58     return r;
59 #elif SIMDPP_USE_NEON
60     return vcntq_u8(a.native());
61 #elif SIMDPP_USE_VSX_207
62     return vec_vpopcnt(a.native());
63 #elif SIMDPP_USE_MSA
64     return (v16u8) __msa_pcnt_b((v16i8) a.native());
65 #else
66     return v_emul_popcnt_u8(a);
67 #endif
68 }
69 
#if SIMDPP_USE_AVX2
// Overload for 32 x 8-bit vectors, compiled only when AVX2 is enabled.
// Always uses the emulated implementation.
static SIMDPP_INL
uint8<32> i_popcnt(const uint8<32>& a)
{
    uint8<32> counts = v_emul_popcnt_u8(a);
    return counts;
}
#endif
77 
#if SIMDPP_USE_AVX512BW
// Overload for 64 x 8-bit vectors, compiled only when AVX512BW is enabled.
// Always uses the emulated implementation.
static SIMDPP_INL
uint8<64> i_popcnt(const uint8<64>& a)
{
    uint8<64> counts = v_emul_popcnt_u8(a);
    return counts;
}
#endif
85 
86 // -----------------------------------------------------------------------------
87 
88 template<class V> SIMDPP_INL
v_emul_popcnt_u16(const V & a)89 V v_emul_popcnt_u16(const V& a)
90 {
91     V p = a;
92     V m55 = splat(0x5555);
93     V m33 = splat(0x3333);
94     V m0f = splat(0x0f0f);
95     V res_mask = splat(0x00ff);
96 
97 
98     p = sub(p, bit_and(shift_r<1>(p), m55));
99     p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
100     p = bit_and(add(p, shift_r<4>(p)), m0f);
101     p = add(p, shift_r<8>(p));
102     p = bit_and(p, res_mask);
103     return p;
104 }
105 
106 static SIMDPP_INL
i_popcnt(const uint16<8> & a)107 uint16<8> i_popcnt(const uint16<8>& a)
108 {
109 #if SIMDPP_USE_NULL
110     uint16<8> r;
111     for (unsigned i = 0; i < a.length; i++) {
112         r.el(i) = detail::null::el_popcnt16(a.el(i));
113     }
114     return r;
115 #elif SIMDPP_USE_NEON
116     uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u16(a.native()));
117     return vpaddlq_u8(p8);
118 #elif SIMDPP_USE_VSX_207
119     return vec_vpopcnt(a.native());
120 #elif SIMDPP_USE_MSA
121     return (v8u16) __msa_pcnt_h((v8i16) a.native());
122 #else
123     return v_emul_popcnt_u16(a);
124 #endif
125 }
126 
#if SIMDPP_USE_AVX2
// Overload for 16 x 16-bit vectors, compiled only when AVX2 is enabled.
// Always uses the emulated implementation.
static SIMDPP_INL
uint16<16> i_popcnt(const uint16<16>& a)
{
    uint16<16> counts = v_emul_popcnt_u16(a);
    return counts;
}
#endif
134 
#if SIMDPP_USE_AVX512BW
// Overload for 32 x 16-bit vectors, compiled only when AVX512BW is enabled.
// Always uses the emulated implementation.
static SIMDPP_INL
uint16<32> i_popcnt(const uint16<32>& a)
{
    uint16<32> counts = v_emul_popcnt_u16(a);
    return counts;
}
#endif
142 
143 // -----------------------------------------------------------------------------
144 
145 template<class V> SIMDPP_INL
v_emul_popcnt_u32(const V & a)146 V v_emul_popcnt_u32(const V& a)
147 {
148     V p = a;
149     V m55 = splat(0x55555555);
150     V m33 = splat(0x33333333);
151     V m0f = splat(0x0f0f0f0f);
152 
153     p = sub(p, bit_and(shift_r<1>(p), m55));
154     p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
155     p = bit_and(add(p, shift_r<4>(p)), m0f);
156 #if SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_MSA
157     V m01 = splat(0x01010101);
158     // rather than doing 2 adds + 2 shifts we can do 1 mul + 1 shift
159     p = shift_r<24>(mul_lo(p, m01));
160 #else
161     V res_mask = splat(0x000000ff);
162     p = add(p, shift_r<8>(p));
163     p = add(p, shift_r<16>(p));
164     p = bit_and(p, res_mask);
165 #endif
166     return p;
167 }
168 
169 static SIMDPP_INL
i_popcnt(const uint32<4> & a)170 uint32<4> i_popcnt(const uint32<4>& a)
171 {
172 #if SIMDPP_USE_NULL
173     uint32<4> r;
174     for (unsigned i = 0; i < a.length; i++) {
175         r.el(i) = detail::null::el_popcnt32(a.el(i));
176     }
177     return r;
178 #elif SIMDPP_USE_X86_POPCNT_INSN
179     // slightly faster than the vectorized version
180     unsigned a0 = _mm_popcnt_u32(extract<0>(a));
181     unsigned a1 = _mm_popcnt_u32(extract<1>(a));
182     unsigned a2 = _mm_popcnt_u32(extract<2>(a));
183     unsigned a3 = _mm_popcnt_u32(extract<3>(a));
184     uint16<8> r = _mm_cvtsi32_si128(a0);
185     r = insert<2>(r, a1);
186     r = insert<4>(r, a2);
187     r = insert<6>(r, a3);
188     return (uint32<4>) r;
189 #elif SIMDPP_USE_NEON
190     uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u32(a.native()));
191     uint16x8_t p16 = vpaddlq_u8(p8);
192     return vpaddlq_u16(p16);
193 #elif SIMDPP_USE_VSX_207
194     return vec_vpopcnt(a.native());
195 #elif SIMDPP_USE_MSA
196     return (v4u32) __msa_pcnt_w((v4i32) a.native());
197 #else
198     return v_emul_popcnt_u32(a);
199 #endif
200 }
201 
#if SIMDPP_USE_AVX2
// Overload for 8 x 32-bit vectors, compiled only when AVX2 is enabled.
// Always uses the emulated implementation.
static SIMDPP_INL
uint32<8> i_popcnt(const uint32<8>& a)
{
    uint32<8> counts = v_emul_popcnt_u32(a);
    return counts;
}
#endif
209 
#if SIMDPP_USE_AVX512F
// Overload for 16 x 32-bit vectors, compiled only when AVX512F is enabled.
// Currently always uses the emulated implementation.
static SIMDPP_INL
uint32<16> i_popcnt(const uint32<16>& a)
{
    // TODO: support AVX512VPOPCNTDQ
    uint32<16> counts = v_emul_popcnt_u32(a);
    return counts;
}
#endif
218 
219 // -----------------------------------------------------------------------------
220 
221 template<class V> SIMDPP_INL
v_emul_popcnt_u64(const V & a)222 V v_emul_popcnt_u64(const V& a)
223 {
224     V p = a;
225     V m55 = splat(0x5555555555555555);
226     V m33 = splat(0x3333333333333333);
227     V m0f = splat(0x0f0f0f0f0f0f0f0f);
228     V res_mask = splat(0x00000000000000ff);
229 
230 
231     p = sub(p, bit_and(shift_r<1>(p), m55));
232     p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
233     p = bit_and(add(p, shift_r<4>(p)), m0f);
234     p = add(p, shift_r<8>(p));
235     p = add(p, shift_r<16>(p));
236     p = add(p, shift_r<32>(p));
237     p = bit_and(p, res_mask);
238     return p;
239 }
240 
241 static SIMDPP_INL
i_popcnt(const uint64<2> & a)242 uint64<2> i_popcnt(const uint64<2>& a)
243 {
244 #if SIMDPP_USE_NULL
245     uint64<2> r;
246     for (unsigned i = 0; i < a.length; i++) {
247         r.el(i) = detail::null::el_popcnt64(a.el(i));
248     }
249     return r;
250 #elif SIMDPP_USE_X86_POPCNT_INSN
251     unsigned a0, a1;
252 #if SIMDPP_64_BITS
253     a0 = _mm_popcnt_u64(extract<0>(a));
254     a1 = _mm_popcnt_u64(extract<1>(a));
255 #else
256     uint32<4> a32; a32 = a;
257     a0 =  _mm_popcnt_u32(extract<0>(a32));
258     a0 += _mm_popcnt_u32(extract<1>(a32));
259     a1 =  _mm_popcnt_u32(extract<2>(a32));
260     a1 += _mm_popcnt_u32(extract<3>(a32));
261 #endif
262     uint16<8> r = _mm_cvtsi32_si128(a0);
263     r = insert<4>(r, a1);
264     return (uint64<2>) r;
265 #elif SIMDPP_USE_SSE2
266     uint8<16> p8 = v_emul_popcnt_u8((uint8<16>) a);
267     return _mm_sad_epu8(p8.native(), _mm_setzero_si128());
268 #elif SIMDPP_USE_NEON
269     uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u64(a.native()));
270     uint16x8_t p16 = vpaddlq_u8(p8);
271     uint32x4_t p32 = vpaddlq_u16(p16);
272     return vpaddlq_u32(p32);
273 #elif SIMDPP_USE_VSX_207
274     return vec_vpopcnt(a.native());
275 #elif SIMDPP_USE_MSA
276     return (v2u64) __msa_pcnt_d((v2i64) a.native());
277 #else
278     return v_emul_popcnt_u64(a);
279 #endif
280 }
281 
#if SIMDPP_USE_AVX2
// Overload for 4 x 64-bit vectors, compiled only when AVX2 is enabled.
static SIMDPP_INL
uint64<4> i_popcnt(const uint64<4>& a)
{
#if SIMDPP_USE_X86_POPCNT_INSN && SIMDPP_64_BITS
    // Split into two 2-element vectors, count each through the uint64<2>
    // overload, and join the results again.
    uint64<2> part0, part1;
    split(a, part0, part1);
    part0 = i_popcnt(part0);
    part1 = i_popcnt(part1);
    uint64<4> joined = combine(part0, part1);
    return joined;
#else
    // Per-byte counts summed by a sum-of-absolute-differences against zero.
    uint8<32> byte_counts = v_emul_popcnt_u8((uint8<32>) a);
    const __m256i zero = _mm256_setzero_si256();
    return _mm256_sad_epu8(byte_counts.native(), zero);
#endif
}
#endif
298 
#if SIMDPP_USE_AVX512F
// Overload for 8 x 64-bit vectors, compiled only when AVX512F is enabled.
static SIMDPP_INL
uint64<8> i_popcnt(const uint64<8>& a)
{
    // TODO: support AVX512VPOPCNTDQ
#if SIMDPP_USE_AVX512BW
    // Per-byte counts summed by a sum-of-absolute-differences against zero.
    uint8<64> byte_counts = v_emul_popcnt_u8((uint8<64>) a);
    const __m512i zero = _mm512_setzero_si512();
    return _mm512_sad_epu8(byte_counts.native(), zero);
#else
    // Without AVX512BW the fully emulated 64-bit ladder is used.
    uint64<8> counts = v_emul_popcnt_u64(a);
    return counts;
#endif
}
#endif
312 
313 // -----------------------------------------------------------------------------
314 
// Generic overload for vector types not matched by the non-template
// overloads above. The whole body is produced by SIMDPP_VEC_ARRAY_IMPL1,
// which receives the return type V, the operation name i_popcnt and the
// argument a.
// NOTE(review): presumably the macro applies i_popcnt to each native
// sub-vector of `a` and returns the assembled result — confirm against
// simdpp/detail/vector_array_macros.h.
template<class V> SIMDPP_INL
V i_popcnt(const V& a)
{
    SIMDPP_VEC_ARRAY_IMPL1(V, i_popcnt, a)
}
320 
321 
322 } // namespace insn
323 } // namespace detail
324 } // namespace SIMDPP_ARCH_NAMESPACE
325 } // namespace simdpp
326 
327 #endif
328 
329