1 ///
2 /// @file generate_phi.hpp
3 /// @brief The PhiCache class calculates the partial sieve function
4 /// (a.k.a. Legendre-sum) using the recursive formula:
5 /// phi(x, a) = phi(x, a - 1) - phi(x / primes[a], a - 1).
6 /// phi(x, a) counts the numbers <= x that are not divisible
7 /// by any of the first a primes. The algorithm used is an
8 /// optimized version of the recursive algorithm described in
9 /// Tomás Oliveira e Silva's paper [2]. I have added 5
10 /// optimizations to my implementation which speed up the
11 /// computation by several orders of magnitude.
12 ///
13 /// [1] In-depth description of primecount's phi(x, a) implementation:
14 /// https://github.com/kimwalisch/primecount/blob/master/doc/Partial-Sieve-Function.md
15 /// [2] Tomás Oliveira e Silva, Computing pi(x): the combinatorial
16 /// method, Revista do DETUA, vol. 4, no. 6, March 2006, p. 761.
17 /// http://sweet.ua.pt/tos/bib/5.4.pdf
18 ///
19 /// Copyright (C) 2021 Kim Walisch, <kim.walisch@gmail.com>
20 ///
21 /// This file is distributed under the BSD License. See the COPYING
22 /// file in the top level directory.
23 ///
24
25 #ifndef GENERATE_PHI_HPP
26 #define GENERATE_PHI_HPP
27
28 #include <primecount-internal.hpp>
29 #include <BitSieve240.hpp>
30 #include <fast_div.hpp>
31 #include <imath.hpp>
32 #include <min.hpp>
33 #include <PhiTiny.hpp>
34 #include <PiTable.hpp>
35 #include <pod_vector.hpp>
36 #include <popcnt.hpp>
37
38 #include <stdint.h>
39 #include <cassert>
40 #include <utility>
41 #include <vector>
42
43 namespace {
44
45 using namespace primecount;
46
47 template <typename Primes>
48 class PhiCache : public BitSieve240
49 {
50 public:
PhiCache(uint64_t x,uint64_t a,const Primes & primes,const PiTable & pi)51 PhiCache(uint64_t x,
52 uint64_t a,
53 const Primes& primes,
54 const PiTable& pi) :
55 primes_(primes),
56 pi_(pi)
57 {
58 // We cache phi(x, a) if a <= max_a.
59 // The value max_a = 100 has been determined empirically
60 // by running benchmarks. Using a smaller or larger
61 // max_a with the same amount of memory (max_megabytes)
62 // decreases the performance.
63 uint64_t max_a = 100;
64
65 // Make sure we cache only frequently used values
66 a = a - min(a, 30);
67 max_a = min(a, max_a);
68
69 if (max_a <= PhiTiny::max_a())
70 return;
71
72 // We cache phi(x, a) if x <= max_x.
73 // The value max_x = sqrt(x) has been determined by running
74 // S2_hard(x) and D(x) benchmarks from 1e12 to 1e21.
75 uint64_t max_x = isqrt(x);
76
77 // The cache (i.e. the sieve array)
78 // uses at most max_megabytes per thread.
79 uint64_t max_megabytes = 16;
80 uint64_t indexes = max_a - PhiTiny::max_a();
81 uint64_t max_bytes = max_megabytes << 20;
82 uint64_t max_bytes_per_index = max_bytes / indexes;
83 uint64_t numbers_per_byte = 240 / sizeof(sieve_t);
84 uint64_t cache_limit = max_bytes_per_index * numbers_per_byte;
85 max_x = min(max_x, cache_limit);
86 max_x_size_ = ceil_div(max_x, 240);
87
88 // For tiny computations caching is not worth it
89 if (max_x_size_ < 8)
90 return;
91
92 // Make sure that there are no uninitialized
93 // bits in the last sieve array element.
94 assert(max_x_size_ > 0);
95 max_x_ = max_x_size_ * 240 - 1;
96 max_a_ = max_a;
97 sieve_.resize(max_a_ + 1);
98 }
99
100 /// Calculate phi(x, a) using the recursive formula:
101 /// phi(x, a) = phi(x, a - 1) - phi(x / primes[a], a - 1)
102 ///
103 template <int SIGN>
phi(int64_t x,int64_t a)104 int64_t phi(int64_t x, int64_t a)
105 {
106 if (x <= (int64_t) primes_[a])
107 return SIGN;
108 else if (is_phi_tiny(a))
109 return phi_tiny(x, a) * SIGN;
110 else if (is_pix(x, a))
111 return (pi_[x] - a + 1) * SIGN;
112 else if (is_cached(x, a))
113 return phi_cache(x, a) * SIGN;
114
115 // Cache all small phi(x, i) results with:
116 // x <= max_x && i <= min(a, max_a)
117 init_cache(x, a);
118
119 int64_t sqrtx = isqrt(x);
120 int64_t c = PhiTiny::get_c(sqrtx);
121 int64_t c_cached = min(a, max_a_cached_);
122 int64_t sum, i;
123
124 if (c >= c_cached ||
125 !is_cached(x, c_cached))
126 sum = phi_tiny(x, c) * SIGN;
127 else
128 {
129 c = c_cached;
130 assert(c_cached <= a);
131 sum = phi_cache(x, c) * SIGN;
132 }
133
134 for (i = c + 1; i <= a; i++)
135 {
136 // phi(x / prime[i], i - 1) = 1 if x / prime[i] <= prime[i-1].
137 // However we can do slightly better:
138 // If prime[i] > sqrt(x) and prime[i-1] <= sqrt(x) then
139 // phi(x / prime[i], i - 1) = 1 even if x / prime[i] > prime[i-1].
140 // This works because in this case there is no other prime
141 // inside the interval ]prime[i-1], x / prime[i]].
142 if (primes_[i] > sqrtx)
143 break;
144 int64_t xp = fast_div(x, primes_[i]);
145 if (is_pix(xp, i - 1))
146 break;
147 sum += phi<-SIGN>(xp, i - 1);
148 }
149
150 for (; i <= a; i++)
151 {
152 if (primes_[i] > sqrtx)
153 break;
154 int64_t xp = fast_div(x, primes_[i]);
155 // if a >= pi(sqrt(x)): phi(x, a) = pi(x) - a + 1
156 // phi(xp, i - 1) = pi(xp) - (i - 1) + 1
157 // phi(xp, i - 1) = pi(xp) - i + 2
158 sum += (pi_[xp] - i + 2) * -SIGN;
159 }
160
161 // phi(xp, i - 1) = 1 for i in [i, a]
162 sum += (a + 1 - i) * -SIGN;
163 return sum;
164 }
165
166 private:
167 /// phi(x, a) counts the numbers <= x that are not divisible by any of
168 /// the first a primes. If a >= pi(sqrt(x)) then phi(x, a) counts the
169 /// number of primes <= x, minus the first a primes, plus the number 1.
170 /// Hence if a >= pi(sqrt(x)): phi(x, a) = pi(x) - a + 1.
171 ///
is_pix(uint64_t x,uint64_t a) const172 bool is_pix(uint64_t x, uint64_t a) const
173 {
174 return x < pi_.size() &&
175 x < isquare(primes_[a + 1]);
176 }
177
is_cached(uint64_t x,uint64_t a) const178 bool is_cached(uint64_t x, uint64_t a) const
179 {
180 return x <= max_x_ &&
181 a <= max_a_cached_;
182 }
183
phi_cache(uint64_t x,uint64_t a) const184 int64_t phi_cache(uint64_t x, uint64_t a) const
185 {
186 assert(a < sieve_.size());
187 assert(x / 240 < sieve_[a].size());
188
189 uint64_t count = sieve_[a][x / 240].count;
190 uint64_t bits = sieve_[a][x / 240].bits;
191 uint64_t bitmask = unset_larger_[x % 240];
192 return count + popcnt64(bits & bitmask);
193 }
194
195 /// Cache phi(x, i) results with: x <= max_x && i <= min(a, max_a).
196 /// Eratosthenes-like sieving algorithm that removes the first a primes
197 /// and their multiples from the sieve array. Additionally this
198 /// algorithm counts the numbers that are not divisible by any of the
199 /// first a primes after sieving has completed. After sieving and
200 /// counting has finished phi(x, a) results can be retrieved from the
201 /// cache in O(1) using the phi_cache(x, a) method.
202 ///
init_cache(uint64_t x,uint64_t a)203 void init_cache(uint64_t x, uint64_t a)
204 {
205 a = min(a, max_a_);
206
207 if (x > max_x_ ||
208 a <= max_a_cached_)
209 return;
210
211 uint64_t i = max_a_cached_ + 1;
212 max_a_cached_ = a;
213 i = max(i, 3);
214
215 for (; i <= a; i++)
216 {
217 // Each bit in the sieve array corresponds to an integer that
218 // is not divisible by 2, 3 and 5. The 8 bits of each byte
219 // correspond to the offsets { 1, 7, 11, 13, 17, 19, 23, 29 }.
220 if (i == 3)
221 sieve_[i].resize(max_x_size_);
222 else
223 {
224 // Initalize phi(x, i) with phi(x, i - 1)
225 if (i - 1 <= PhiTiny::max_a())
226 sieve_[i] = std::move(sieve_[i - 1]);
227 else
228 sieve_[i] = sieve_[i - 1];
229
230 // Remove prime[i] and its multiples
231 uint64_t prime = primes_[i];
232 if (prime <= max_x_)
233 sieve_[i][prime / 240].bits &= unset_bit_[prime % 240];
234 for (uint64_t n = prime * prime; n <= max_x_; n += prime * 2)
235 sieve_[i][n / 240].bits &= unset_bit_[n % 240];
236
237 if (i > PhiTiny::max_a())
238 {
239 // Fill an array with the cumulative 1 bit counts.
240 // sieve[i][j] contains the count of numbers < j * 240 that
241 // are not divisible by any of the first i primes.
242 uint64_t count = 0;
243 for (auto& sieve : sieve_[i])
244 {
245 sieve.count = (uint32_t) count;
246 count += popcnt64(sieve.bits);
247 }
248 }
249 }
250 }
251 }
252
253 uint64_t max_x_ = 0;
254 uint64_t max_x_size_ = 0;
255 uint64_t max_a_cached_ = 0;
256 uint64_t max_a_ = 0;
257
258 /// Packing sieve_t increases the cache's capacity by 25%
259 /// which improves performance by up to 10%.
260 #pragma pack(push, 1)
261 struct sieve_t
262 {
263 uint32_t count = 0;
264 uint64_t bits = ~0ull;
265 };
266
267 #pragma pack(pop)
268
269 /// sieve[a] contains only numbers that are not divisible
270 /// by any of the the first a primes. sieve[a][i].count
271 /// contains the count of numbers < i * 240 that are not
272 /// divisible by any of the first a primes.
273 std::vector<std::vector<sieve_t>> sieve_;
274 const Primes& primes_;
275 const PiTable& pi_;
276 };
277
278 /// Returns a vector with phi(x, i - 1) values such that
279 /// phi[i] = phi(x, i - 1) for 1 <= i <= a.
280 /// phi(x, a) counts the numbers <= x that are not
281 /// divisible by any of the first a primes.
282 ///
283 template <typename Primes>
284 pod_vector<int64_t>
generate_phi(int64_t x,int64_t a,const Primes & primes,const PiTable & pi)285 generate_phi(int64_t x,
286 int64_t a,
287 const Primes& primes,
288 const PiTable& pi)
289 {
290 int64_t size = a + 1;
291 pod_vector<int64_t> phi(size);
292 phi[0] = 0;
293
294 if (size > 1)
295 {
296 if ((int64_t) primes[a] > x)
297 a = pi[x];
298
299 phi[1] = x;
300 int64_t i = 2;
301 int64_t sqrtx = isqrt(x);
302 PhiCache<Primes> cache(x, a, primes, pi);
303
304 // 2 <= i <= pi(sqrt(x)) + 1
305 for (; i <= a && primes[i - 1] <= sqrtx; i++)
306 phi[i] = phi[i - 1] + cache.template phi<-1>(x / primes[i - 1], i - 2);
307
308 // pi(sqrt(x)) + 1 < i <= a
309 for (; i <= a; i++)
310 phi[i] = phi[i - 1] - (x > 0);
311
312 // a < i < size
313 for (; i < size; i++)
314 phi[i] = x > 0;
315 }
316
317 return phi;
318 }
319
320 } // namespace
321
322 #endif
323