/*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>

    Distributed under the Boost Software License, Version 1.0.
        (See accompanying file LICENSE_1_0.txt or copy at
            http://www.boost.org/LICENSE_1_0.txt)
*/

#ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_POPCNT_H
#define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_POPCNT_H

#ifndef LIBSIMDPP_SIMD_H
    #error "This file must be included through simd.h"
#endif

#include <simdpp/types.h>
#include <simdpp/core/bit_and.h>
#include <simdpp/core/extract.h>
#include <simdpp/core/i_add.h>
#include <simdpp/core/i_shift_r.h>
#include <simdpp/core/i_sub.h>
#include <simdpp/core/i_mul.h>
#include <simdpp/core/insert.h>
#include <simdpp/detail/null/bitwise.h>
#include <simdpp/detail/width.h>
#include <simdpp/detail/vector_array_macros.h>

namespace simdpp {
namespace SIMDPP_ARCH_NAMESPACE {
namespace detail {
namespace insn {
template<class V> SIMDPP_INL
V v_emul_popcnt_u8(const V& a)
{
    // We're using 16-bit ops because on SSE/AVX no 8-bit shift is available.
    // There's no difference on other architectures.
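    //
    // The reduction below is the classic SWAR ("SIMD within a register")
    // popcount: each step sums bit counts within progressively wider fields
    // (2-bit, then 4-bit, then 8-bit) using only shifts, masks and adds.
    // Roughly the scalar equivalent, traced for one byte (a sketch for
    // exposition only, not part of the library):
    //
    //     x = 0b10110110;                     // popcount is 5
    //     x -= (x >> 1) & 0x55;               // 0b01100101: pair counts 1,2,1,1
    //     x = (x & 0x33) + ((x >> 2) & 0x33); // 0b00110010: nibble counts 3,2
    //     x = (x + (x >> 4)) & 0x0f;          // 0b00000101 == 5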
    using w_b16 = typename same_width<V>::u16;

    w_b16 p = (w_b16)a;
    w_b16 m55 = splat(0x5555);
    w_b16 m33 = splat(0x3333);
    w_b16 m0f = splat(0x0f0f);

    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
    return (V) p;
}

static SIMDPP_INL
uint8<16> i_popcnt(const uint8<16>& a)
{
#if SIMDPP_USE_NULL
    uint8<16> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt8(a.el(i));
    }
    return r;
#elif SIMDPP_USE_NEON
    return vcntq_u8(a.native());
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v16u8) __msa_pcnt_b((v16i8) a.native());
#else
    return v_emul_popcnt_u8(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint8<32> i_popcnt(const uint8<32>& a)
{
    return v_emul_popcnt_u8(a);
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint8<64> i_popcnt(const uint8<64>& a)
{
    return v_emul_popcnt_u8(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u16(const V& a)
{
    V p = a;
    V m55 = splat(0x5555);
    V m33 = splat(0x3333);
    V m0f = splat(0x0f0f);
    V res_mask = splat(0x00ff);

    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
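    // one extra fold relative to the u8 variant: sum the two byte counts of
    // each 16-bit lane, then clear the upper byte, which no longer holds
    // anything meaningful (the count of a 16-bit lane is at most 16, so it
    // always fits in the low byte)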
    p = add(p, shift_r<8>(p));
    p = bit_and(p, res_mask);
    return p;
}

static SIMDPP_INL
uint16<8> i_popcnt(const uint16<8>& a)
{
#if SIMDPP_USE_NULL
    uint16<8> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt16(a.el(i));
    }
    return r;
#elif SIMDPP_USE_NEON
    // vcntq_u8 counts bits per byte; vpaddlq_u8 pairwise-adds adjacent byte
    // counts into the containing 16-bit lanes
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u16(a.native()));
    return vpaddlq_u8(p8);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v8u16) __msa_pcnt_h((v8i16) a.native());
#else
    return v_emul_popcnt_u16(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint16<16> i_popcnt(const uint16<16>& a)
{
    return v_emul_popcnt_u16(a);
}
#endif

#if SIMDPP_USE_AVX512BW
static SIMDPP_INL
uint16<32> i_popcnt(const uint16<32>& a)
{
    return v_emul_popcnt_u16(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u32(const V& a)
{
    V p = a;
    V m55 = splat(0x55555555);
    V m33 = splat(0x33333333);
    V m0f = splat(0x0f0f0f0f);

    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
#if SIMDPP_USE_SSE4_1 || SIMDPP_USE_NEON || SIMDPP_USE_MSA
    V m01 = splat(0x01010101);
    // rather than doing 2 adds + 2 shifts we can do 1 mul + 1 shift
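    // why this works: with per-byte counts b0..b3 (b0 least significant), the
    // low 32 bits of (b3:b2:b1:b0) * 0x01010101 hold b0+b1+b2+b3 in the top
    // byte; no carries can occur since each count is at most 8, so the lane
    // total is at most 32. shift_r<24> then extracts that byte.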
    p = shift_r<24>(mul_lo(p, m01));
#else
    V res_mask = splat(0x000000ff);
    p = add(p, shift_r<8>(p));
    p = add(p, shift_r<16>(p));
    p = bit_and(p, res_mask);
#endif
    return p;
}

static SIMDPP_INL
uint32<4> i_popcnt(const uint32<4>& a)
{
#if SIMDPP_USE_NULL
    uint32<4> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt32(a.el(i));
    }
    return r;
#elif SIMDPP_USE_X86_POPCNT_INSN
    // slightly faster than the vectorized version
    unsigned a0 = _mm_popcnt_u32(extract<0>(a));
    unsigned a1 = _mm_popcnt_u32(extract<1>(a));
    unsigned a2 = _mm_popcnt_u32(extract<2>(a));
    unsigned a3 = _mm_popcnt_u32(extract<3>(a));
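    // each count is at most 32, so it fits in a 16-bit element; positions
    // 0, 2, 4 and 6 of the uint16<8> alias the low halves of the four 32-bit
    // lanes, and _mm_cvtsi32_si128 has already zeroed the odd positions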
    uint16<8> r = _mm_cvtsi32_si128(a0);
    r = insert<2>(r, a1);
    r = insert<4>(r, a2);
    r = insert<6>(r, a3);
    return (uint32<4>) r;
#elif SIMDPP_USE_NEON
    // successive pairwise widening adds: byte counts -> u16 lanes -> u32 lanes
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u32(a.native()));
    uint16x8_t p16 = vpaddlq_u8(p8);
    return vpaddlq_u16(p16);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v4u32) __msa_pcnt_w((v4i32) a.native());
#else
    return v_emul_popcnt_u32(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint32<8> i_popcnt(const uint32<8>& a)
{
    return v_emul_popcnt_u32(a);
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint32<16> i_popcnt(const uint32<16>& a)
{
    // TODO: support AVX512VPOPCNTDQ
    return v_emul_popcnt_u32(a);
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V v_emul_popcnt_u64(const V& a)
{
    V p = a;
    V m55 = splat(0x5555555555555555);
    V m33 = splat(0x3333333333333333);
    V m0f = splat(0x0f0f0f0f0f0f0f0f);
    V res_mask = splat(0x00000000000000ff);

    p = sub(p, bit_and(shift_r<1>(p), m55));
    p = add(bit_and(p, m33), bit_and(shift_r<2>(p), m33));
    p = bit_and(add(p, shift_r<4>(p)), m0f);
    // fold the eight byte counts of each 64-bit lane into the low byte
    p = add(p, shift_r<8>(p));
    p = add(p, shift_r<16>(p));
    p = add(p, shift_r<32>(p));
    p = bit_and(p, res_mask);
    return p;
}

static SIMDPP_INL
uint64<2> i_popcnt(const uint64<2>& a)
{
#if SIMDPP_USE_NULL
    uint64<2> r;
    for (unsigned i = 0; i < a.length; i++) {
        r.el(i) = detail::null::el_popcnt64(a.el(i));
    }
    return r;
#elif SIMDPP_USE_X86_POPCNT_INSN
    unsigned a0, a1;
#if SIMDPP_64_BITS
    a0 = _mm_popcnt_u64(extract<0>(a));
    a1 = _mm_popcnt_u64(extract<1>(a));
#else
    // no 64-bit POPCNT on 32-bit targets: sum the counts of each element's halves
    uint32<4> a32; a32 = a;
    a0 = _mm_popcnt_u32(extract<0>(a32));
    a0 += _mm_popcnt_u32(extract<1>(a32));
    a1 = _mm_popcnt_u32(extract<2>(a32));
    a1 += _mm_popcnt_u32(extract<3>(a32));
#endif
    uint16<8> r = _mm_cvtsi32_si128(a0);
    r = insert<4>(r, a1);
    return (uint64<2>) r;
#elif SIMDPP_USE_SSE2
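    // _mm_sad_epu8 against zero sums the absolute differences of the eight
    // bytes in each 64-bit half, i.e. it horizontally adds the per-byte
    // counts straight into the two 64-bit lanes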
    uint8<16> p8 = v_emul_popcnt_u8((uint8<16>) a);
    return _mm_sad_epu8(p8.native(), _mm_setzero_si128());
#elif SIMDPP_USE_NEON
    uint8x16_t p8 = vcntq_u8(vreinterpretq_u8_u64(a.native()));
    uint16x8_t p16 = vpaddlq_u8(p8);
    uint32x4_t p32 = vpaddlq_u16(p16);
    return vpaddlq_u32(p32);
#elif SIMDPP_USE_VSX_207
    return vec_vpopcnt(a.native());
#elif SIMDPP_USE_MSA
    return (v2u64) __msa_pcnt_d((v2i64) a.native());
#else
    return v_emul_popcnt_u64(a);
#endif
}

#if SIMDPP_USE_AVX2
static SIMDPP_INL
uint64<4> i_popcnt(const uint64<4>& a)
{
#if SIMDPP_USE_X86_POPCNT_INSN && SIMDPP_64_BITS
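    // presumably mirrors the uint32 case above, where a few scalar POPCNT
    // instructions were found slightly faster than the vectorized emulation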
    uint64<2> a0, a1;
    split(a, a0, a1);
    a0 = i_popcnt(a0);
    a1 = i_popcnt(a1);
    return combine(a0, a1);
#else
    uint8<32> p8 = v_emul_popcnt_u8((uint8<32>) a);
    return _mm256_sad_epu8(p8.native(), _mm256_setzero_si256());
#endif
}
#endif

#if SIMDPP_USE_AVX512F
static SIMDPP_INL
uint64<8> i_popcnt(const uint64<8>& a)
{
    // TODO: support AVX512VPOPCNTDQ
#if SIMDPP_USE_AVX512BW
    uint8<64> p8 = v_emul_popcnt_u8((uint8<64>) a);
    return _mm512_sad_epu8(p8.native(), _mm512_setzero_si512());
#else
    return v_emul_popcnt_u64(a);
#endif
}
#endif

// -----------------------------------------------------------------------------

template<class V> SIMDPP_INL
V i_popcnt(const V& a)
{
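    // fallback for wider vectors: apply i_popcnt to each native-width
    // sub-vector of a and collect the results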
    SIMDPP_VEC_ARRAY_IMPL1(V, i_popcnt, a)
}

} // namespace insn
} // namespace detail
} // namespace SIMDPP_ARCH_NAMESPACE
} // namespace simdpp

#endif