1 // Copyright 2021 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 // Barrett division, finding the inverse with Newton's method.
6 // Reference: "Fast Division of Large Integers" by Karl Hasselström,
7 // found at https://treskal.com/s/masters-thesis.pdf
8 
9 // Many thanks to Karl Wiberg, k@w5.se, for both writing up an
10 // understandable theoretical description of the algorithm and privately
11 // providing a demo implementation, on which the implementation in this file is
12 // based.
13 
14 #include <algorithm>
15 
16 #include "src/bigint/bigint-internal.h"
17 #include "src/bigint/digit-arithmetic.h"
18 #include "src/bigint/div-helpers.h"
19 #include "src/bigint/vector-arithmetic.h"
20 
21 namespace v8 {
22 namespace bigint {
23 
24 namespace {
25 
DcheckIntegerPartRange(Digits X,digit_t min,digit_t max)26 void DcheckIntegerPartRange(Digits X, digit_t min, digit_t max) {
27 #if DEBUG
28   digit_t integer_part = X.msd();
29   DCHECK(integer_part >= min);
30   DCHECK(integer_part <= max);
31 #else
32   USE(X);
33   USE(min);
34   USE(max);
35 #endif
36 }
37 
38 }  // namespace
39 
40 // Z := (the fractional part of) 1/V, via naive division.
41 // See comments at {Invert} and {InvertNewton} below for details.
InvertBasecase(RWDigits Z,Digits V,RWDigits scratch)42 void ProcessorImpl::InvertBasecase(RWDigits Z, Digits V, RWDigits scratch) {
43   DCHECK(Z.len() > V.len());
44   DCHECK(V.len() > 0);  // NOLINT(readability/check)
45   DCHECK(scratch.len() >= 2 * V.len());
46   int n = V.len();
47   RWDigits X(scratch, 0, 2 * n);
48   digit_t borrow = 0;
49   int i = 0;
50   for (; i < n; i++) X[i] = 0;
51   for (; i < 2 * n; i++) X[i] = digit_sub2(0, V[i - n], borrow, &borrow);
52   DCHECK(borrow == 1);     // NOLINT(readability/check)
53   RWDigits R(nullptr, 0);  // We don't need the remainder.
54   if (n < kBurnikelThreshold) {
55     DivideSchoolbook(Z, R, X, V);
56   } else {
57     DivideBurnikelZiegler(Z, R, X, V);
58   }
59 }
60 
61 // This is Algorithm 4.2 from the paper.
62 // Computes the inverse of V, shifted by kDigitBits * 2 * V.len, accurate to
63 // V.len+1 digits. The V.len low digits of the result digits will be written
64 // to Z, plus there is an implicit top digit with value 1.
65 // Needs InvertNewtonScratchSpace(V.len) of scratch space.
66 // The result is either correct or off by one (about half the time it is
67 // correct, half the time it is one too much, and in the corner case where V is
68 // minimal and the implicit top digit would have to be 2 it is one too little).
69 // Barrett's division algorithm can handle that, so we don't care.
InvertNewton(RWDigits Z,Digits V,RWDigits scratch)70 void ProcessorImpl::InvertNewton(RWDigits Z, Digits V, RWDigits scratch) {
71   const int vn = V.len();
72   DCHECK(Z.len() >= vn);
73   DCHECK(scratch.len() >= InvertNewtonScratchSpace(vn));
74   const int kSOffset = 0;
75   const int kWOffset = 0;  // S and W can share their scratch space.
76   const int kUOffset = vn + kInvertNewtonExtraSpace;
77 
78   // The base case won't work otherwise.
79   DCHECK(V.len() >= 3);  // NOLINT(readability/check)
80 
81   constexpr int kBasecasePrecision = kNewtonInversionThreshold - 1;
82   // V must have more digits than the basecase.
83   DCHECK(V.len() > kBasecasePrecision);
84   DCHECK(IsBitNormalized(V));
85 
86   // Step (1): Setup.
87   // Calculate precision required at each step.
88   // {k} is the number of fraction bits for the current iteration.
89   int k = vn * kDigitBits;
90   int target_fraction_bits[8 * sizeof(vn)];  // "k_i" in the paper.
91   int iteration = -1;  // "i" in the paper, except inverted to run downwards.
92   while (k > kBasecasePrecision * kDigitBits) {
93     iteration++;
94     target_fraction_bits[iteration] = k;
95     k = DIV_CEIL(k, 2);
96   }
97   // At this point, k <= kBasecasePrecision*kDigitBits is the number of
98   // fraction bits to use in the base case. {iteration} is the highest index
99   // in use for f[].
100 
101   // Step (2): Initial approximation.
102   int initial_digits = DIV_CEIL(k + 1, kDigitBits);
103   Digits top_part_of_v(V, vn - initial_digits, initial_digits);
104   InvertBasecase(Z, top_part_of_v, scratch);
105   Z[initial_digits] = Z[initial_digits] + 1;  // Implicit top digit.
106   // From now on, we'll keep Z.len updated to the part that's already computed.
107   Z.set_len(initial_digits + 1);
108 
109   // Step (3): Precision doubling loop.
110   while (true) {
111     DcheckIntegerPartRange(Z, 1, 2);
112 
113     // (3b): S = Z^2
114     RWDigits S(scratch, kSOffset, 2 * Z.len());
115     Multiply(S, Z, Z);
116     if (should_terminate()) return;
117     S.TrimOne();  // Top digit of S is unused.
118     DcheckIntegerPartRange(S, 1, 4);
119 
120     // (3c): T = V, truncated so that at least 2k+3 fraction bits remain.
121     int fraction_digits = DIV_CEIL(2 * k + 3, kDigitBits);
122     int t_len = std::min(V.len(), fraction_digits);
123     Digits T(V, V.len() - t_len, t_len);
124 
125     // (3d): U = T * S, truncated so that at least 2k+1 fraction bits remain
126     // (U has one integer digit, which might be zero).
127     fraction_digits = DIV_CEIL(2 * k + 1, kDigitBits);
128     RWDigits U(scratch, kUOffset, S.len() + T.len());
129     DCHECK(U.len() > fraction_digits);
130     Multiply(U, S, T);
131     if (should_terminate()) return;
132     U = U + (U.len() - (1 + fraction_digits));
133     DcheckIntegerPartRange(U, 0, 3);
134 
135     // (3e): W = 2 * Z, padded with "0" fraction bits so that it has the
136     // same number of fraction bits as U.
137     DCHECK(U.len() >= Z.len());
138     RWDigits W(scratch, kWOffset, U.len());
139     int padding_digits = U.len() - Z.len();
140     for (int i = 0; i < padding_digits; i++) W[i] = 0;
141     LeftShift(W + padding_digits, Z, 1);
142     DcheckIntegerPartRange(W, 2, 4);
143 
144     // (3f): Z = W - U.
145     // This check is '<=' instead of '<' because U's top digit is its
146     // integer part, and we want vn fraction digits.
147     if (U.len() <= vn) {
148       // Normal subtraction.
149       // This is not the last iteration.
150       DCHECK(iteration > 0);  // NOLINT(readability/check)
151       Z.set_len(U.len());
152       digit_t borrow = SubtractAndReturnBorrow(Z, W, U);
153       DCHECK(borrow == 0);  // NOLINT(readability/check)
154       USE(borrow);
155       DcheckIntegerPartRange(Z, 1, 2);
156     } else {
157       // Truncate some least significant digits so that we get vn
158       // fraction digits, and compute the integer digit separately.
159       // This is the last iteration.
160       DCHECK(iteration == 0);  // NOLINT(readability/check)
161       Z.set_len(vn);
162       Digits W_part(W, W.len() - vn - 1, vn);
163       Digits U_part(U, U.len() - vn - 1, vn);
164       digit_t borrow = SubtractAndReturnBorrow(Z, W_part, U_part);
165       digit_t integer_part = W.msd() - U.msd() - borrow;
166       DCHECK(integer_part == 1 || integer_part == 2);
167       if (integer_part == 2) {
168         // This is the rare case where the correct result would be 2.0, but
169         // since we can't express that by returning only the fractional part
170         // with an implicit 1-digit, we have to return [1.]9999... instead.
171         for (int i = 0; i < Z.len(); i++) Z[i] = ~digit_t{0};
172       }
173       break;
174     }
175     // (3g, 3h): Update local variables and loop.
176     k = target_fraction_bits[iteration];
177     iteration--;
178   }
179 }
180 
181 // Computes the inverse of V, shifted by kDigitBits * 2 * V.len, accurate to
182 // V.len+1 digits. The V.len low digits of the result digits will be written
183 // to Z, plus there is an implicit top digit with value 1.
184 // (Corner case: if V is minimal, the implicit digit should be 2; in that case
185 // we return one less than the correct answer. DivideBarrett can handle that.)
186 // Needs InvertScratchSpace(V.len) digits of scratch space.
Invert(RWDigits Z,Digits V,RWDigits scratch)187 void ProcessorImpl::Invert(RWDigits Z, Digits V, RWDigits scratch) {
188   DCHECK(Z.len() > V.len());
189   DCHECK(V.len() >= 1);  // NOLINT(readability/check)
190   DCHECK(IsBitNormalized(V));
191   DCHECK(scratch.len() >= InvertScratchSpace(V.len()));
192 
193   int vn = V.len();
194   if (vn >= kNewtonInversionThreshold) {
195     return InvertNewton(Z, V, scratch);
196   }
197   if (vn == 1) {
198     digit_t d = V[0];
199     digit_t dummy_remainder;
200     Z[0] = digit_div(~d, ~digit_t{0}, d, &dummy_remainder);
201     Z[1] = 0;
202   } else {
203     InvertBasecase(Z, V, scratch);
204     if (Z[vn] == 1) {
205       for (int i = 0; i < vn; i++) Z[i] = ~digit_t{0};
206       Z[vn] = 0;
207     }
208   }
209 }
210 
211 // This is algorithm 3.5 from the paper.
212 // Computes Q(uotient) and R(emainder) for A/B using I, which is a
213 // precomputed approximation of 1/B (e.g. with Invert() above).
214 // Needs DivideBarrettScratchSpace(A.len) scratch space.
DivideBarrett(RWDigits Q,RWDigits R,Digits A,Digits B,Digits I,RWDigits scratch)215 void ProcessorImpl::DivideBarrett(RWDigits Q, RWDigits R, Digits A, Digits B,
216                                   Digits I, RWDigits scratch) {
217   DCHECK(Q.len() > A.len() - B.len());
218   DCHECK(R.len() >= B.len());
219   DCHECK(A.len() > B.len());  // Careful: This is *not* '>=' !
220   DCHECK(A.len() <= 2 * B.len());
221   DCHECK(B.len() > 0);  // NOLINT(readability/check)
222   DCHECK(IsBitNormalized(B));
223   DCHECK(I.len() == A.len() - B.len());
224   DCHECK(scratch.len() >= DivideBarrettScratchSpace(A.len()));
225 
226   int orig_q_len = Q.len();
227 
228   // (1): A1 = A with B.len fewer digits.
229   Digits A1 = A + B.len();
230   DCHECK(A1.len() == I.len());
231 
232   // (2): Q = A1*I with I.len fewer digits.
233   // {I} has an implicit high digit with value 1, so we add {A1} to the high
234   // part of the multiplication result.
235   RWDigits K(scratch, 0, 2 * I.len());
236   Multiply(K, A1, I);
237   if (should_terminate()) return;
238   Q.set_len(I.len() + 1);
239   Add(Q, K + I.len(), A1);
240   // K is no longer used, can re-use {scratch} for P.
241 
242   // (3): R = A - B*Q (approximate remainder).
243   RWDigits P(scratch, 0, A.len() + 1);
244   Multiply(P, B, Q);
245   if (should_terminate()) return;
246   digit_t borrow = SubtractAndReturnBorrow(R, A, Digits(P, 0, B.len()));
247   // R may be allocated wider than B, zero out any extra digits if so.
248   for (int i = B.len(); i < R.len(); i++) R[i] = 0;
249   digit_t r_high = A[B.len()] - P[B.len()] - borrow;
250 
251   // Adjust R and Q so that they become the correct remainder and quotient.
252   // The number of iterations is guaranteed to be at most some very small
253   // constant, unless the caller gave us a bad approximate quotient.
254   if (r_high >> (kDigitBits - 1) == 1) {
255     // (5b): R < 0, so R += B
256     digit_t q_sub = 0;
257     do {
258       r_high += AddAndReturnCarry(R, R, B);
259       q_sub++;
260       DCHECK(q_sub <= 5);  // NOLINT(readability/check)
261     } while (r_high != 0);
262     Subtract(Q, q_sub);
263   } else {
264     digit_t q_add = 0;
265     while (r_high != 0 || GreaterThanOrEqual(R, B)) {
266       // (5c): R >= B, so R -= B
267       r_high -= SubtractAndReturnBorrow(R, R, B);
268       q_add++;
269       DCHECK(q_add <= 5);  // NOLINT(readability/check)
270     }
271     Add(Q, q_add);
272   }
273   // (5a): Return.
274   int final_q_len = Q.len();
275   Q.set_len(orig_q_len);
276   for (int i = final_q_len; i < orig_q_len; i++) Q[i] = 0;
277 }
278 
279 // Computes Q(uotient) and R(emainder) for A/B, using Barrett division.
DivideBarrett(RWDigits Q,RWDigits R,Digits A,Digits B)280 void ProcessorImpl::DivideBarrett(RWDigits Q, RWDigits R, Digits A, Digits B) {
281   DCHECK(Q.len() > A.len() - B.len());
282   DCHECK(R.len() >= B.len());
283   DCHECK(A.len() > B.len());  // Careful: This is *not* '>=' !
284   DCHECK(B.len() > 0);        // NOLINT(readability/check)
285 
286   // Normalize B, and shift A by the same amount.
287   ShiftedDigits b_normalized(B);
288   ShiftedDigits a_normalized(A, b_normalized.shift());
289   // Keep the code below more concise.
290   B = b_normalized;
291   A = a_normalized;
292 
293   // The core DivideBarrett function above only supports A having at most
294   // twice as many digits as B. We generalize this to arbitrary inputs
295   // similar to Burnikel-Ziegler division by performing a t-by-1 division
296   // of B-sized chunks. It's easy to special-case the situation where we
297   // don't need to bother.
298   int barrett_dividend_length = A.len() <= 2 * B.len() ? A.len() : 2 * B.len();
299   int i_len = barrett_dividend_length - B.len();
300   ScratchDigits I(i_len + 1);  // +1 is for temporary use by Invert().
301   int scratch_len =
302       std::max(InvertScratchSpace(i_len),
303                DivideBarrettScratchSpace(barrett_dividend_length));
304   ScratchDigits scratch(scratch_len);
305   Invert(I, Digits(B, B.len() - i_len, i_len), scratch);
306   if (should_terminate()) return;
307   I.TrimOne();
308   DCHECK(I.len() == i_len);
309   if (A.len() > 2 * B.len()) {
310     // This follows the variable names and and algorithmic steps of
311     // DivideBurnikelZiegler().
312     int n = B.len();  // Chunk length.
313     // (5): {t} is the number of B-sized chunks of A.
314     int t = DIV_CEIL(A.len(), n);
315     DCHECK(t >= 3);  // NOLINT(readability/check)
316     // (6)/(7): Z is used for the current 2-chunk block to be divided by B,
317     // initialized to the two topmost chunks of A.
318     int z_len = n * 2;
319     ScratchDigits Z(z_len);
320     PutAt(Z, A + n * (t - 2), z_len);
321     // (8): For i from t-2 downto 0 do
322     int qi_len = n + 1;
323     ScratchDigits Qi(qi_len);
324     ScratchDigits Ri(n);
325     // First iteration unrolled and specialized.
326     {
327       int i = t - 2;
328       DivideBarrett(Qi, Ri, Z, B, I, scratch);
329       if (should_terminate()) return;
330       RWDigits target = Q + n * i;
331       // In the first iteration, all qi_len = n + 1 digits may be used.
332       int to_copy = std::min(qi_len, target.len());
333       for (int j = 0; j < to_copy; j++) target[j] = Qi[j];
334       for (int j = to_copy; j < target.len(); j++) target[j] = 0;
335 #if DEBUG
336       for (int j = to_copy; j < Qi.len(); j++) {
337         DCHECK(Qi[j] == 0);  // NOLINT(readability/check)
338       }
339 #endif
340     }
341     // Now loop over any remaining iterations.
342     for (int i = t - 3; i >= 0; i--) {
343       // (8b): If i > 0, set Z_(i-1) = [Ri, A_(i-1)].
344       // (De-duped with unrolled first iteration, hence reading A_(i).)
345       PutAt(Z + n, Ri, n);
346       PutAt(Z, A + n * i, n);
347       // (8a): Compute Qi, Ri such that Zi = B*Qi + Ri.
348       DivideBarrett(Qi, Ri, Z, B, I, scratch);
349       DCHECK(Qi[qi_len - 1] == 0);  // NOLINT(readability/check)
350       if (should_terminate()) return;
351       // (9): Return Q = [Q_(t-2), ..., Q_0]...
352       PutAt(Q + n * i, Qi, n);
353     }
354     Ri.Normalize();
355     DCHECK(Ri.len() <= R.len());
356     // (9): ...and R = R_0 * 2^(-leading_zeros).
357     RightShift(R, Ri, b_normalized.shift());
358   } else {
359     DivideBarrett(Q, R, A, B, I, scratch);
360     if (should_terminate()) return;
361     RightShift(R, R, b_normalized.shift());
362   }
363 }
364 
365 }  // namespace bigint
366 }  // namespace v8
367