1 ///
2 /// @file Erat.hpp
3 ///
4 /// Copyright (C) 2020 Kim Walisch, <kim.walisch@gmail.com>
5 ///
6 /// This file is distributed under the BSD License. See the COPYING
7 /// file in the top level directory.
8 ///
9
10 #ifndef ERAT_HPP
11 #define ERAT_HPP
12
13 #include "forward.hpp"
14 #include "EratSmall.hpp"
15 #include "EratMedium.hpp"
16 #include "EratBig.hpp"
17 #include "macros.hpp"
18
19 #include <stdint.h>
20 #include <array>
21 #include <memory>
22
23 /// In order convert 1 bits of the sieve array into primes we
24 /// need to quickly calculate the index of the first set bit.
25 /// This CPU instruction is usually named CTZ (Count trailing
26 /// zeros). Unfortunately only x86 and x64 CPUs currently have
27 /// an instruction for that, other CPU architectures like
28 /// ARM64 or PPC64 have to emulate the CTZ instruction using
29 /// multiple other instructions.
30 ///
31 /// On x64 CPUs there are actually 2 instructions to count the
32 /// number of trailing zeros: BSF and TZCNT. BSF is an old
33 /// instruction whereas TZCNT is much more recent (Bit
34 /// Manipulation Instruction Set 1). Since I expect BSF to be
35 /// slow on future x64 CPUs (because it is legacy) we only use
36 /// __builtin_ctzll() if we can guarantee that TZCNT will be
37 /// generated.
38 ///
39 /// There is also a quick, pure integer algorithm known for
40 /// quickly computing the index of the 1st set bit. This
41 /// algorithm is named the "De Bruijn bitscan".
42 /// https://www.chessprogramming.org/BitScan
43 ///
44 /// Because of this situation, we only use __builtin_ctzll()
45 /// or std::countr_zero() when we know that the user's CPU
46 /// architecture can quickly compute CTZ, either using a single
47 /// instruction or emulated using very few instructions. For
48 /// all other CPU architectures we fallback to the "De Bruijn
49 /// bitscan" algorithm.
50
51 #if !defined(__has_builtin)
52 #define __has_builtin(x) 0
53 #endif
54
55 #if !defined(__has_include)
56 #define __has_include(x) 0
57 #endif
58
59 #if __cplusplus >= 202002L && \
60 __has_include(<bit>) && \
61 (defined(__BMI__) /* TZCNT (x64) */ || \
62 defined(__aarch64__) /* CTZ = RBIT + CLZ */ || \
63 defined(_M_ARM64) /* CTZ = RBIT + CLZ */)
64
65 #include <bit>
66 #define ctz64(x) std::countr_zero(x)
67
68 #elif __has_builtin(__builtin_ctzll) && \
69 (defined(__BMI__) /* TZCNT (x64) */ || \
70 defined(__aarch64__) /* CTZ = RBIT + CLZ */ || \
71 defined(_M_ARM64) /* CTZ = RBIT + CLZ */)
72
73 #define ctz64(x) __builtin_ctzll(x)
74
75 #endif
76
77 namespace primesieve {
78
79 class PreSieve;
80
81 /// The abstract Erat class sieves primes using the segmented sieve
82 /// of Eratosthenes. It uses a bit array for sieving, the bit array
83 /// uses 8 flags for 30 numbers. Erat uses 3 different sieve of
84 /// Eratosthenes algorithms optimized for small, medium and big
85 /// sieving primes to cross-off multiples.
86 ///
87 class Erat
88 {
89 public:
90 uint64_t getSieveSize() const;
91 uint64_t getStop() const;
92
93 protected:
94 /// Sieve primes >= start_
95 uint64_t start_ = 0;
96 /// Sieve primes <= stop_
97 uint64_t stop_ = 0;
98 /// Size of sieve_ in bytes (power of 2)
99 uint64_t sieveSize_ = 0;
100 /// Lower bound of the current segment
101 uint64_t segmentLow_ = ~0ull;
102 /// Upper bound of the current segment
103 uint64_t segmentHigh_ = 0;
104 /// Sieve of Eratosthenes array
105 uint8_t* sieve_ = nullptr;
106 Erat();
107 Erat(uint64_t, uint64_t);
108 void init(uint64_t, uint64_t, uint64_t, PreSieve&);
109 void addSievingPrime(uint64_t);
110 NOINLINE void sieveSegment();
111 bool hasNextSegment() const;
112 static uint64_t nextPrime(uint64_t, uint64_t);
113
114 private:
115 uint64_t maxPreSieve_ = 0;
116 uint64_t maxEratSmall_ = 0;
117 uint64_t maxEratMedium_ = 0;
118 std::unique_ptr<uint8_t[]> deleter_;
119 PreSieve* preSieve_ = nullptr;
120 EratSmall eratSmall_;
121 EratBig eratBig_;
122 EratMedium eratMedium_;
123 static uint64_t byteRemainder(uint64_t);
124 uint64_t getL1CacheSize() const;
125 void initSieve(uint64_t);
126 void initErat();
127 void preSieve();
128 void crossOff();
129 void sieveLastSegment();
130 };
131
132 /// Convert 1st set bit into prime
nextPrime(uint64_t bits,uint64_t low)133 inline uint64_t Erat::nextPrime(uint64_t bits, uint64_t low)
134 {
135 #if defined(ctz64)
136 // Find first set 1 bit
137 auto bitIndex = ctz64(bits);
138 uint64_t bitValue = bitValues[bitIndex];
139 uint64_t prime = low + bitValue;
140 return prime;
141 #else
142 // Fallback if CTZ instruction is not avilable
143 uint64_t debruijn = 0x3F08A4C6ACB9DBDull;
144 uint64_t hash = ((bits ^ (bits - 1)) * debruijn) >> 58;
145 uint64_t bitValue = bruijnBitValues[hash];
146 uint64_t prime = low + bitValue;
147 return prime;
148 #endif
149 }
150
addSievingPrime(uint64_t prime)151 inline void Erat::addSievingPrime(uint64_t prime)
152 {
153 if (prime > maxEratMedium_) eratBig_.addSievingPrime(prime, segmentLow_);
154 else if (prime > maxEratSmall_) eratMedium_.addSievingPrime(prime, segmentLow_);
155 else /* (prime > maxPreSieve) */ eratSmall_.addSievingPrime(prime, segmentLow_);
156 }
157
getStop() const158 inline uint64_t Erat::getStop() const
159 {
160 return stop_;
161 }
162
163 /// Sieve size in KiB
getSieveSize() const164 inline uint64_t Erat::getSieveSize() const
165 {
166 return sieveSize_ >> 10;
167 }
168
169 } // namespace
170
171 #endif
172