1 /*  Copyright (C) 2017  Povilas Kanapickas <povilas@radix.lt>
2 
3     Distributed under the Boost Software License, Version 1.0.
4         (See accompanying file LICENSE_1_0.txt or copy at
5             http://www.boost.org/LICENSE_1_0.txt)
6 */
7 
8 #ifndef LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_POPCNT_H
9 #define LIBSIMDPP_SIMDPP_DETAIL_INSN_I_REDUCE_POPCNT_H
10 
11 #ifndef LIBSIMDPP_SIMD_H
12     #error "This file must be included through simd.h"
13 #endif
14 
15 #include <simdpp/types.h>
16 #include <simdpp/detail/null/bitwise.h>
17 #include <simdpp/core/i_popcnt.h>
18 #include <simdpp/core/i_reduce_add.h>
19 
20 namespace simdpp {
21 namespace SIMDPP_ARCH_NAMESPACE {
22 namespace detail {
23 namespace insn {
24 
25 static SIMDPP_INL
i_reduce_popcnt(const uint32<4> & a)26 uint32_t i_reduce_popcnt(const uint32<4>& a)
27 {
28 #if SIMDPP_USE_NULL
29     uint32_t r = 0;
30     for (unsigned i = 0; i < a.length; i++) {
31         r += detail::null::el_popcnt32(a.el(i));
32     }
33     return r;
34 #elif SIMDPP_USE_X86_POPCNT_INSN
35     uint32_t r = 0;
36 #if SIMDPP_64_BITS
37     uint64<2> a64; a64 = a;
38     r += _mm_popcnt_u64(extract<0>(a64));
39     r += _mm_popcnt_u64(extract<1>(a64));
40 #else
41     r += _mm_popcnt_u32(extract<0>(a));
42     r += _mm_popcnt_u32(extract<1>(a));
43     r += _mm_popcnt_u32(extract<2>(a));
44     r += _mm_popcnt_u32(extract<3>(a));
45 #endif
46     return r;
47 #elif SIMDPP_USE_NEON
48     uint8<16> r = vcntq_u8(vreinterpretq_u8_u32(a.native()));
49     return reduce_add(r);
50 #elif SIMDPP_USE_VSX_207 || SIMDPP_USE_MSA
51     uint64<2> a64; a64 = a;
52     a64 = popcnt(a64);
53     return reduce_add(a64);
54 #elif SIMDPP_USE_SSE2
55     uint64<2> r = popcnt((uint64<2>)a);
56     return (uint32_t) reduce_add(r);
57 #else
58     uint32<4> r = popcnt(a);
59     return reduce_add(r);
60 #endif
61 }
62 
#if SIMDPP_USE_AVX2
/*  Returns the total number of set bits in the 256-bit vector @a a.
    Only compiled when AVX2 is enabled.
*/
static SIMDPP_INL
uint32_t i_reduce_popcnt(const uint32<8>& a)
{
#if SIMDPP_USE_X86_POPCNT_INSN && SIMDPP_64_BITS
    // Split into two 128-bit halves and reuse the uint32<4> overload, which
    // uses the 64-bit scalar popcount instruction in this configuration.
    uint32<4> a0, a1;
    split(a, a0, a1);
    return i_reduce_popcnt(a0) + i_reduce_popcnt(a1);
#else
    // Vector popcnt on 64-bit lanes followed by a horizontal sum. The total
    // is at most 256, so narrowing the sum to 32 bits is lossless.
    uint64<4> r = popcnt(static_cast<uint64<4> >(a));
    return static_cast<uint32_t>(reduce_add(r));
#endif
}
#endif
77 
#if SIMDPP_USE_AVX512F
/*  Returns the total number of set bits in the 512-bit vector @a a.
    Only compiled when AVX512F is enabled.
*/
static SIMDPP_INL
uint32_t i_reduce_popcnt(const uint32<16>& a)
{
#if SIMDPP_USE_X86_POPCNT_INSN && SIMDPP_64_BITS
    // Split into two 256-bit halves and delegate to the uint32<8> overload.
    uint32<8> a0, a1;
    split(a, a0, a1);
    return i_reduce_popcnt(a0) + i_reduce_popcnt(a1);
#else
    // TODO: support AVX512VPOPCNTDQ
    uint64<8> r = popcnt(static_cast<uint64<8> >(a));
    // The total is at most 512, so narrowing the 64-bit sum is lossless;
    // the cast is explicit for consistency with the AVX2 overload.
    return static_cast<uint32_t>(reduce_add(r));
#endif
}
#endif
93 
94 template<unsigned N> SIMDPP_INL
i_reduce_popcnt(const uint32<N> & a)95 uint32_t i_reduce_popcnt(const uint32<N>& a)
96 {
97     uint32_t r = 0;
98     for (unsigned j = 0; j < a.vec_length; ++j) {
99         r += i_reduce_popcnt(a.vec(j));
100     }
101     return r;
102 }
103 
104 // -----------------------------------------------------------------------------
105 
106 } // namespace insn
107 } // namespace detail
108 } // namespace SIMDPP_ARCH_NAMESPACE
109 } // namespace simdpp
110 
111 #endif
112 
113