1 ///
2 /// @file  generate_phi.hpp
3 /// @brief The PhiCache class calculates the partial sieve function
4 ///        (a.k.a. Legendre-sum) using the recursive formula:
5 ///        phi(x, a) = phi(x, a - 1) - phi(x / primes[a], a - 1).
6 ///        phi(x, a) counts the numbers <= x that are not divisible
7 ///        by any of the first a primes. The algorithm used is an
8 ///        optimized version of the recursive algorithm described in
9 ///        Tomás Oliveira e Silva's paper [2]. I have added 5
10 ///        optimizations to my implementation which speed up the
11 ///        computation by several orders of magnitude.
12 ///
13 ///    [1] In-depth description of primecount's phi(x, a) implementation:
14 ///        https://github.com/kimwalisch/primecount/blob/master/doc/Partial-Sieve-Function.md
15 ///    [2] Tomás Oliveira e Silva, Computing pi(x): the combinatorial
16 ///        method, Revista do DETUA, vol. 4, no. 6, March 2006, p. 761.
17 ///        http://sweet.ua.pt/tos/bib/5.4.pdf
18 ///
19 /// Copyright (C) 2021 Kim Walisch, <kim.walisch@gmail.com>
20 ///
21 /// This file is distributed under the BSD License. See the COPYING
22 /// file in the top level directory.
23 ///
24 
25 #ifndef GENERATE_PHI_HPP
26 #define GENERATE_PHI_HPP
27 
28 #include <primecount-internal.hpp>
29 #include <BitSieve240.hpp>
30 #include <fast_div.hpp>
31 #include <imath.hpp>
32 #include <min.hpp>
33 #include <PhiTiny.hpp>
34 #include <PiTable.hpp>
35 #include <pod_vector.hpp>
36 #include <popcnt.hpp>
37 
38 #include <stdint.h>
39 #include <cassert>
40 #include <utility>
41 #include <vector>
42 
43 namespace {
44 
45 using namespace primecount;
46 
47 template <typename Primes>
48 class PhiCache : public BitSieve240
49 {
50 public:
PhiCache(uint64_t x,uint64_t a,const Primes & primes,const PiTable & pi)51   PhiCache(uint64_t x,
52            uint64_t a,
53            const Primes& primes,
54            const PiTable& pi) :
55     primes_(primes),
56     pi_(pi)
57   {
58     // We cache phi(x, a) if a <= max_a.
59     // The value max_a = 100 has been determined empirically
60     // by running benchmarks. Using a smaller or larger
61     // max_a with the same amount of memory (max_megabytes)
62     // decreases the performance.
63     uint64_t max_a = 100;
64 
65     // Make sure we cache only frequently used values
66     a = a - min(a, 30);
67     max_a = min(a, max_a);
68 
69     if (max_a <= PhiTiny::max_a())
70       return;
71 
72     // We cache phi(x, a) if x <= max_x.
73     // The value max_x = sqrt(x) has been determined by running
74     // S2_hard(x) and D(x) benchmarks from 1e12 to 1e21.
75     uint64_t max_x = isqrt(x);
76 
77     // The cache (i.e. the sieve array)
78     // uses at most max_megabytes per thread.
79     uint64_t max_megabytes = 16;
80     uint64_t indexes = max_a - PhiTiny::max_a();
81     uint64_t max_bytes = max_megabytes << 20;
82     uint64_t max_bytes_per_index = max_bytes / indexes;
83     uint64_t numbers_per_byte = 240 / sizeof(sieve_t);
84     uint64_t cache_limit = max_bytes_per_index * numbers_per_byte;
85     max_x = min(max_x, cache_limit);
86     max_x_size_ = ceil_div(max_x, 240);
87 
88     // For tiny computations caching is not worth it
89     if (max_x_size_ < 8)
90       return;
91 
92     // Make sure that there are no uninitialized
93     // bits in the last sieve array element.
94     assert(max_x_size_ > 0);
95     max_x_ = max_x_size_ * 240 - 1;
96     max_a_ = max_a;
97     sieve_.resize(max_a_ + 1);
98   }
99 
100   /// Calculate phi(x, a) using the recursive formula:
101   /// phi(x, a) = phi(x, a - 1) - phi(x / primes[a], a - 1)
102   ///
103   template <int SIGN>
phi(int64_t x,int64_t a)104   int64_t phi(int64_t x, int64_t a)
105   {
106     if (x <= (int64_t) primes_[a])
107       return SIGN;
108     else if (is_phi_tiny(a))
109       return phi_tiny(x, a) * SIGN;
110     else if (is_pix(x, a))
111       return (pi_[x] - a + 1) * SIGN;
112     else if (is_cached(x, a))
113       return phi_cache(x, a) * SIGN;
114 
115     // Cache all small phi(x, i) results with:
116     // x <= max_x && i <= min(a, max_a)
117     init_cache(x, a);
118 
119     int64_t sqrtx = isqrt(x);
120     int64_t c = PhiTiny::get_c(sqrtx);
121     int64_t c_cached = min(a, max_a_cached_);
122     int64_t sum, i;
123 
124     if (c >= c_cached ||
125         !is_cached(x, c_cached))
126       sum = phi_tiny(x, c) * SIGN;
127     else
128     {
129       c = c_cached;
130       assert(c_cached <= a);
131       sum = phi_cache(x, c) * SIGN;
132     }
133 
134     for (i = c + 1; i <= a; i++)
135     {
136       // phi(x / prime[i], i - 1) = 1 if x / prime[i] <= prime[i-1].
137       // However we can do slightly better:
138       // If prime[i] > sqrt(x) and prime[i-1] <= sqrt(x) then
139       // phi(x / prime[i], i - 1) = 1 even if x / prime[i] > prime[i-1].
140       // This works because in this case there is no other prime
141       // inside the interval ]prime[i-1], x / prime[i]].
142       if (primes_[i] > sqrtx)
143         break;
144       int64_t xp = fast_div(x, primes_[i]);
145       if (is_pix(xp, i - 1))
146         break;
147       sum += phi<-SIGN>(xp, i - 1);
148     }
149 
150     for (; i <= a; i++)
151     {
152       if (primes_[i] > sqrtx)
153         break;
154       int64_t xp = fast_div(x, primes_[i]);
155       // if a >= pi(sqrt(x)): phi(x, a) = pi(x) - a + 1
156       // phi(xp, i - 1) = pi(xp) - (i - 1) + 1
157       // phi(xp, i - 1) = pi(xp) - i + 2
158       sum += (pi_[xp] - i + 2) * -SIGN;
159     }
160 
161     // phi(xp, i - 1) = 1 for i in [i, a]
162     sum += (a + 1 - i) * -SIGN;
163     return sum;
164   }
165 
166 private:
167   /// phi(x, a) counts the numbers <= x that are not divisible by any of
168   /// the first a primes. If a >= pi(sqrt(x)) then phi(x, a) counts the
169   /// number of primes <= x, minus the first a primes, plus the number 1.
170   /// Hence if a >= pi(sqrt(x)): phi(x, a) = pi(x) - a + 1.
171   ///
is_pix(uint64_t x,uint64_t a) const172   bool is_pix(uint64_t x, uint64_t a) const
173   {
174     return x < pi_.size() &&
175            x < isquare(primes_[a + 1]);
176   }
177 
is_cached(uint64_t x,uint64_t a) const178   bool is_cached(uint64_t x, uint64_t a) const
179   {
180     return x <= max_x_ &&
181            a <= max_a_cached_;
182   }
183 
phi_cache(uint64_t x,uint64_t a) const184   int64_t phi_cache(uint64_t x, uint64_t a) const
185   {
186     assert(a < sieve_.size());
187     assert(x / 240 < sieve_[a].size());
188 
189     uint64_t count = sieve_[a][x / 240].count;
190     uint64_t bits = sieve_[a][x / 240].bits;
191     uint64_t bitmask = unset_larger_[x % 240];
192     return count + popcnt64(bits & bitmask);
193   }
194 
195   /// Cache phi(x, i) results with: x <= max_x && i <= min(a, max_a).
196   /// Eratosthenes-like sieving algorithm that removes the first a primes
197   /// and their multiples from the sieve array. Additionally this
198   /// algorithm counts the numbers that are not divisible by any of the
199   /// first a primes after sieving has completed. After sieving and
200   /// counting has finished phi(x, a) results can be retrieved from the
201   /// cache in O(1) using the phi_cache(x, a) method.
202   ///
init_cache(uint64_t x,uint64_t a)203   void init_cache(uint64_t x, uint64_t a)
204   {
205     a = min(a, max_a_);
206 
207     if (x > max_x_ ||
208         a <= max_a_cached_)
209       return;
210 
211     uint64_t i = max_a_cached_ + 1;
212     max_a_cached_ = a;
213     i = max(i, 3);
214 
215     for (; i <= a; i++)
216     {
217       // Each bit in the sieve array corresponds to an integer that
218       // is not divisible by 2, 3 and 5. The 8 bits of each byte
219       // correspond to the offsets { 1, 7, 11, 13, 17, 19, 23, 29 }.
220       if (i == 3)
221         sieve_[i].resize(max_x_size_);
222       else
223       {
224         // Initalize phi(x, i) with phi(x, i - 1)
225         if (i - 1 <= PhiTiny::max_a())
226           sieve_[i] = std::move(sieve_[i - 1]);
227         else
228           sieve_[i] = sieve_[i - 1];
229 
230         // Remove prime[i] and its multiples
231         uint64_t prime = primes_[i];
232         if (prime <= max_x_)
233           sieve_[i][prime / 240].bits &= unset_bit_[prime % 240];
234         for (uint64_t n = prime * prime; n <= max_x_; n += prime * 2)
235           sieve_[i][n / 240].bits &= unset_bit_[n % 240];
236 
237         if (i > PhiTiny::max_a())
238         {
239           // Fill an array with the cumulative 1 bit counts.
240           // sieve[i][j] contains the count of numbers < j * 240 that
241           // are not divisible by any of the first i primes.
242           uint64_t count = 0;
243           for (auto& sieve : sieve_[i])
244           {
245             sieve.count = (uint32_t) count;
246             count += popcnt64(sieve.bits);
247           }
248         }
249       }
250     }
251   }
252 
253   uint64_t max_x_ = 0;
254   uint64_t max_x_size_ = 0;
255   uint64_t max_a_cached_ = 0;
256   uint64_t max_a_ = 0;
257 
258   /// Packing sieve_t increases the cache's capacity by 25%
259   /// which improves performance by up to 10%.
260   #pragma pack(push, 1)
261   struct sieve_t
262   {
263     uint32_t count = 0;
264     uint64_t bits = ~0ull;
265   };
266 
267   #pragma pack(pop)
268 
269   /// sieve[a] contains only numbers that are not divisible
270   /// by any of the the first a primes. sieve[a][i].count
271   /// contains the count of numbers < i * 240 that are not
272   /// divisible by any of the first a primes.
273   std::vector<std::vector<sieve_t>> sieve_;
274   const Primes& primes_;
275   const PiTable& pi_;
276 };
277 
278 /// Returns a vector with phi(x, i - 1) values such that
279 /// phi[i] = phi(x, i - 1) for 1 <= i <= a.
280 /// phi(x, a) counts the numbers <= x that are not
281 /// divisible by any of the first a primes.
282 ///
283 template <typename Primes>
284 pod_vector<int64_t>
generate_phi(int64_t x,int64_t a,const Primes & primes,const PiTable & pi)285 generate_phi(int64_t x,
286              int64_t a,
287              const Primes& primes,
288              const PiTable& pi)
289 {
290   int64_t size = a + 1;
291   pod_vector<int64_t> phi(size);
292   phi[0] = 0;
293 
294   if (size > 1)
295   {
296     if ((int64_t) primes[a] > x)
297       a = pi[x];
298 
299     phi[1] = x;
300     int64_t i = 2;
301     int64_t sqrtx = isqrt(x);
302     PhiCache<Primes> cache(x, a, primes, pi);
303 
304     // 2 <= i <= pi(sqrt(x)) + 1
305     for (; i <= a && primes[i - 1] <= sqrtx; i++)
306       phi[i] = phi[i - 1] + cache.template phi<-1>(x / primes[i - 1], i - 2);
307 
308     // pi(sqrt(x)) + 1 < i <= a
309     for (; i <= a; i++)
310       phi[i] = phi[i - 1] - (x > 0);
311 
312     // a < i < size
313     for (; i < size; i++)
314       phi[i] = x > 0;
315   }
316 
317   return phi;
318 }
319 
320 } // namespace
321 
322 #endif
323