1 ///
2 /// @file  Erat.hpp
3 ///
4 /// Copyright (C) 2020 Kim Walisch, <kim.walisch@gmail.com>
5 ///
6 /// This file is distributed under the BSD License. See the COPYING
7 /// file in the top level directory.
8 ///
9 
10 #ifndef ERAT_HPP
11 #define ERAT_HPP
12 
13 #include "forward.hpp"
14 #include "EratSmall.hpp"
15 #include "EratMedium.hpp"
16 #include "EratBig.hpp"
17 #include "macros.hpp"
18 
19 #include <stdint.h>
20 #include <array>
21 #include <memory>
22 
23 /// In order convert 1 bits of the sieve array into primes we
24 /// need to quickly calculate the index of the first set bit.
25 /// This CPU instruction is usually named CTZ (Count trailing
26 /// zeros). Unfortunately only x86 and x64 CPUs currently have
27 /// an instruction for that, other CPU architectures like
28 /// ARM64 or PPC64 have to emulate the CTZ instruction using
29 /// multiple other instructions.
30 ///
31 /// On x64 CPUs there are actually 2 instructions to count the
32 /// number of trailing zeros: BSF and TZCNT. BSF is an old
33 /// instruction whereas TZCNT is much more recent (Bit
34 /// Manipulation Instruction Set 1). Since I expect BSF to be
35 /// slow on future x64 CPUs (because it is legacy) we only use
36 /// __builtin_ctzll() if we can guarantee that TZCNT will be
37 /// generated.
38 ///
39 /// There is also a quick, pure integer algorithm known for
40 /// quickly computing the index of the 1st set bit. This
41 /// algorithm is named the "De Bruijn bitscan".
42 /// https://www.chessprogramming.org/BitScan
43 ///
44 /// Because of this situation, we only use __builtin_ctzll()
45 /// or std::countr_zero() when we know that the user's CPU
46 /// architecture can quickly compute CTZ, either using a single
47 /// instruction or emulated using very few instructions. For
48 /// all other CPU architectures we fallback to the "De Bruijn
49 /// bitscan" algorithm.
50 
51 #if !defined(__has_builtin)
52   #define __has_builtin(x) 0
53 #endif
54 
55 #if !defined(__has_include)
56   #define __has_include(x) 0
57 #endif
58 
59 #if __cplusplus >= 202002L && \
60     __has_include(<bit>) && \
61     (defined(__BMI__) /* TZCNT (x64) */ || \
62      defined(__aarch64__) /* CTZ = RBIT + CLZ */ || \
63      defined(_M_ARM64) /* CTZ = RBIT + CLZ */)
64 
65 #include <bit>
66 #define ctz64(x) std::countr_zero(x)
67 
68 #elif __has_builtin(__builtin_ctzll) && \
69     (defined(__BMI__) /* TZCNT (x64) */ || \
70      defined(__aarch64__) /* CTZ = RBIT + CLZ */ || \
71      defined(_M_ARM64) /* CTZ = RBIT + CLZ */)
72 
73 #define ctz64(x) __builtin_ctzll(x)
74 
75 #endif
76 
77 namespace primesieve {
78 
79 class PreSieve;
80 
81 /// The abstract Erat class sieves primes using the segmented sieve
82 /// of Eratosthenes. It uses a bit array for sieving, the bit array
83 /// uses 8 flags for 30 numbers. Erat uses 3 different sieve of
84 /// Eratosthenes algorithms optimized for small, medium and big
85 /// sieving primes to cross-off multiples.
86 ///
87 class Erat
88 {
89 public:
90   uint64_t getSieveSize() const;
91   uint64_t getStop() const;
92 
93 protected:
94   /// Sieve primes >= start_
95   uint64_t start_ = 0;
96   /// Sieve primes <= stop_
97   uint64_t stop_ = 0;
98   /// Size of sieve_ in bytes (power of 2)
99   uint64_t sieveSize_ = 0;
100   /// Lower bound of the current segment
101   uint64_t segmentLow_ = ~0ull;
102   /// Upper bound of the current segment
103   uint64_t segmentHigh_ = 0;
104   /// Sieve of Eratosthenes array
105   uint8_t* sieve_ = nullptr;
106   Erat();
107   Erat(uint64_t, uint64_t);
108   void init(uint64_t, uint64_t, uint64_t, PreSieve&);
109   void addSievingPrime(uint64_t);
110   NOINLINE void sieveSegment();
111   bool hasNextSegment() const;
112   static uint64_t nextPrime(uint64_t, uint64_t);
113 
114 private:
115   uint64_t maxPreSieve_ = 0;
116   uint64_t maxEratSmall_ = 0;
117   uint64_t maxEratMedium_ = 0;
118   std::unique_ptr<uint8_t[]> deleter_;
119   PreSieve* preSieve_ = nullptr;
120   EratSmall eratSmall_;
121   EratBig eratBig_;
122   EratMedium eratMedium_;
123   static uint64_t byteRemainder(uint64_t);
124   uint64_t getL1CacheSize() const;
125   void initSieve(uint64_t);
126   void initErat();
127   void preSieve();
128   void crossOff();
129   void sieveLastSegment();
130 };
131 
132 /// Convert 1st set bit into prime
nextPrime(uint64_t bits,uint64_t low)133 inline uint64_t Erat::nextPrime(uint64_t bits, uint64_t low)
134 {
135 #if defined(ctz64)
136   // Find first set 1 bit
137   auto bitIndex = ctz64(bits);
138   uint64_t bitValue = bitValues[bitIndex];
139   uint64_t prime = low + bitValue;
140   return prime;
141 #else
142   // Fallback if CTZ instruction is not avilable
143   uint64_t debruijn = 0x3F08A4C6ACB9DBDull;
144   uint64_t hash = ((bits ^ (bits - 1)) * debruijn) >> 58;
145   uint64_t bitValue = bruijnBitValues[hash];
146   uint64_t prime = low + bitValue;
147   return prime;
148 #endif
149 }
150 
addSievingPrime(uint64_t prime)151 inline void Erat::addSievingPrime(uint64_t prime)
152 {
153        if (prime > maxEratMedium_)   eratBig_.addSievingPrime(prime, segmentLow_);
154   else if (prime > maxEratSmall_) eratMedium_.addSievingPrime(prime, segmentLow_);
155   else /* (prime > maxPreSieve) */ eratSmall_.addSievingPrime(prime, segmentLow_);
156 }
157 
getStop() const158 inline uint64_t Erat::getStop() const
159 {
160   return stop_;
161 }
162 
163 /// Sieve size in KiB
getSieveSize() const164 inline uint64_t Erat::getSieveSize() const
165 {
166   return sieveSize_ >> 10;
167 }
168 
169 } // namespace
170 
171 #endif
172