1 ///
2 /// @file popcount.cpp
3 /// @brief Quickly count the number of 1 bits in an array.
4 ///
5 /// The "Harley-Seal popcount" algorithm that we use is a pure
6 /// integer algorithm that does not use the POPCNT instruction
7 /// present on many CPU architectures. There are a few reasons
8 /// why we do not use the POPCNT instruction here:
9 ///
10 /// 1) This algorithm is not really a bottleneck.
11 /// 2) This algorithm is portable (unlike POPCNT on x64)
12 /// and very fast, its speed is very close to POPCNT.
13 /// 3) Recent compilers can autovectorize this loop (e.g
14 /// using AVX512 on x64 CPUs) in which case this algorithm
15 /// will even outperform the POPCNT instruction.
16 ///
17 /// Copyright (C) 2020 Kim Walisch, <kim.walisch@gmail.com>
18 ///
19 /// This file is distributed under the BSD License. See the COPYING
20 /// file in the top level directory.
21 ///
22
23 #include <stdint.h>
24
25 namespace {
26
27 /// This uses fewer arithmetic operations than any other known
28 /// implementation on machines with fast multiplication.
29 /// It uses 12 arithmetic operations, one of which is a multiply.
30 /// https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
31 ///
popcount64(uint64_t x)32 uint64_t popcount64(uint64_t x)
33 {
34 const uint64_t m1 = 0x5555555555555555ull;
35 const uint64_t m2 = 0x3333333333333333ull;
36 const uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
37 const uint64_t h01 = 0x0101010101010101ull;
38
39 x -= (x >> 1) & m1;
40 x = (x & m2) + ((x >> 2) & m2);
41 x = (x + (x >> 4)) & m4;
42 return (x * h01) >> 56;
43 }
44
45 /// Carry-save adder (CSA).
46 /// @see Chapter 5 in "Hacker's Delight".
47 ///
CSA(uint64_t & h,uint64_t & l,uint64_t a,uint64_t b,uint64_t c)48 void CSA(uint64_t& h, uint64_t& l, uint64_t a, uint64_t b, uint64_t c)
49 {
50 uint64_t u = a ^ b;
51 h = (a & b) | (u & c);
52 l = u ^ c;
53 }
54
55 } // namespace
56
57 namespace primesieve {
58
59 /// Harley-Seal popcount (4th iteration).
60 /// The Harley-Seal popcount algorithm is one of the fastest algorithms
61 /// for counting 1 bits in an array using only integer operations.
62 /// This implementation uses only 5.69 instructions per 64-bit word.
63 /// @see Chapter 5 in "Hacker's Delight" 2nd edition.
64 ///
popcount(const uint64_t * array,uint64_t size)65 uint64_t popcount(const uint64_t* array, uint64_t size)
66 {
67 uint64_t total = 0;
68 uint64_t ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0;
69 uint64_t twosA, twosB, foursA, foursB, eightsA, eightsB;
70 uint64_t limit = size - size % 16;
71 uint64_t i = 0;
72
73 for(; i < limit; i += 16)
74 {
75 CSA(twosA, ones, ones, array[i+0], array[i+1]);
76 CSA(twosB, ones, ones, array[i+2], array[i+3]);
77 CSA(foursA, twos, twos, twosA, twosB);
78 CSA(twosA, ones, ones, array[i+4], array[i+5]);
79 CSA(twosB, ones, ones, array[i+6], array[i+7]);
80 CSA(foursB, twos, twos, twosA, twosB);
81 CSA(eightsA,fours, fours, foursA, foursB);
82 CSA(twosA, ones, ones, array[i+8], array[i+9]);
83 CSA(twosB, ones, ones, array[i+10], array[i+11]);
84 CSA(foursA, twos, twos, twosA, twosB);
85 CSA(twosA, ones, ones, array[i+12], array[i+13]);
86 CSA(twosB, ones, ones, array[i+14], array[i+15]);
87 CSA(foursB, twos, twos, twosA, twosB);
88 CSA(eightsB, fours, fours, foursA, foursB);
89 CSA(sixteens, eights, eights, eightsA, eightsB);
90
91 total += popcount64(sixteens);
92 }
93
94 total *= 16;
95 total += 8 * popcount64(eights);
96 total += 4 * popcount64(fours);
97 total += 2 * popcount64(twos);
98 total += 1 * popcount64(ones);
99
100 for(; i < size; i++)
101 total += popcount64(array[i]);
102
103 return total;
104 }
105
106 } // namespace
107