1 ///
2 /// @file   popcount.cpp
3 /// @brief  Quickly count the number of 1 bits in an array.
4 ///
5 ///         The "Harley-Seal popcount" algorithm that we use is a pure
6 ///         integer algorithm that does not use the POPCNT instruction
7 ///         present on many CPU architectures. There are a few reasons
8 ///         why we do not use the POPCNT instruction here:
9 ///
10 ///         1) This algorithm is not really a bottleneck.
11 ///         2) This algorithm is portable (unlike POPCNT on x64)
12 ///            and very fast, its speed is very close to POPCNT.
13 ///         3) Recent compilers can autovectorize this loop (e.g
14 ///            using AVX512 on x64 CPUs) in which case this algorithm
15 ///            will even outperform the POPCNT instruction.
16 ///
17 /// Copyright (C) 2020 Kim Walisch, <kim.walisch@gmail.com>
18 ///
19 /// This file is distributed under the BSD License. See the COPYING
20 /// file in the top level directory.
21 ///
22 
23 #include <stdint.h>
24 
25 namespace {
26 
27 /// This uses fewer arithmetic operations than any other known
28 /// implementation on machines with fast multiplication.
29 /// It uses 12 arithmetic operations, one of which is a multiply.
30 /// https://en.wikipedia.org/wiki/Hamming_weight#Efficient_implementation
31 ///
popcount64(uint64_t x)32 uint64_t popcount64(uint64_t x)
33 {
34   const uint64_t m1 = 0x5555555555555555ull;
35   const uint64_t m2 = 0x3333333333333333ull;
36   const uint64_t m4 = 0x0F0F0F0F0F0F0F0Full;
37   const uint64_t h01 = 0x0101010101010101ull;
38 
39   x -= (x >> 1) & m1;
40   x = (x & m2) + ((x >> 2) & m2);
41   x = (x + (x >> 4)) & m4;
42   return (x * h01) >> 56;
43 }
44 
45 /// Carry-save adder (CSA).
46 /// @see Chapter 5 in "Hacker's Delight".
47 ///
CSA(uint64_t & h,uint64_t & l,uint64_t a,uint64_t b,uint64_t c)48 void CSA(uint64_t& h, uint64_t& l, uint64_t a, uint64_t b, uint64_t c)
49 {
50   uint64_t u = a ^ b;
51   h = (a & b) | (u & c);
52   l = u ^ c;
53 }
54 
55 } // namespace
56 
57 namespace primesieve {
58 
59 /// Harley-Seal popcount (4th iteration).
60 /// The Harley-Seal popcount algorithm is one of the fastest algorithms
61 /// for counting 1 bits in an array using only integer operations.
62 /// This implementation uses only 5.69 instructions per 64-bit word.
63 /// @see Chapter 5 in "Hacker's Delight" 2nd edition.
64 ///
popcount(const uint64_t * array,uint64_t size)65 uint64_t popcount(const uint64_t* array, uint64_t size)
66 {
67   uint64_t total = 0;
68   uint64_t ones = 0, twos = 0, fours = 0, eights = 0, sixteens = 0;
69   uint64_t twosA, twosB, foursA, foursB, eightsA, eightsB;
70   uint64_t limit = size - size % 16;
71   uint64_t i = 0;
72 
73   for(; i < limit; i += 16)
74   {
75     CSA(twosA, ones, ones, array[i+0], array[i+1]);
76     CSA(twosB, ones, ones, array[i+2], array[i+3]);
77     CSA(foursA, twos, twos, twosA, twosB);
78     CSA(twosA, ones, ones, array[i+4], array[i+5]);
79     CSA(twosB, ones, ones, array[i+6], array[i+7]);
80     CSA(foursB, twos, twos, twosA, twosB);
81     CSA(eightsA,fours, fours, foursA, foursB);
82     CSA(twosA, ones, ones, array[i+8], array[i+9]);
83     CSA(twosB, ones, ones, array[i+10], array[i+11]);
84     CSA(foursA, twos, twos, twosA, twosB);
85     CSA(twosA, ones, ones, array[i+12], array[i+13]);
86     CSA(twosB, ones, ones, array[i+14], array[i+15]);
87     CSA(foursB, twos, twos, twosA, twosB);
88     CSA(eightsB, fours, fours, foursA, foursB);
89     CSA(sixteens, eights, eights, eightsA, eightsB);
90 
91     total += popcount64(sixteens);
92   }
93 
94   total *= 16;
95   total += 8 * popcount64(eights);
96   total += 4 * popcount64(fours);
97   total += 2 * popcount64(twos);
98   total += 1 * popcount64(ones);
99 
100   for(; i < size; i++)
101     total += popcount64(array[i]);
102 
103   return total;
104 }
105 
106 } // namespace
107