1 // Copyright 2021 the V8 project authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "src/bigint/bigint-internal.h"
6 #include "src/bigint/vector-arithmetic.h"
7 
8 namespace v8 {
9 namespace bigint {
10 
11 // The classic algorithm: for every part, multiply the accumulator with
12 // the appropriate multiplier, and add the part. O(n²) overall.
FromStringClassic(RWDigits Z,FromStringAccumulator * accumulator)13 void ProcessorImpl::FromStringClassic(RWDigits Z,
14                                       FromStringAccumulator* accumulator) {
15   // We always have at least one part to process.
16   DCHECK(accumulator->stack_parts_used_ > 0);  // NOLINT(readability/check)
17   Z[0] = accumulator->stack_parts_[0];
18   RWDigits already_set(Z, 0, 1);
19   for (int i = 1; i < Z.len(); i++) Z[i] = 0;
20 
21   // The {FromStringAccumulator} uses stack-allocated storage for the first
22   // few parts; if heap storage is used at all then all parts are copied there.
23   int num_stack_parts = accumulator->stack_parts_used_;
24   if (num_stack_parts == 1) return;
25   const std::vector<digit_t>& heap_parts = accumulator->heap_parts_;
26   int num_heap_parts = static_cast<int>(heap_parts.size());
27   // All multipliers are the same, except possibly for the last.
28   const digit_t max_multiplier = accumulator->max_multiplier_;
29 
30   if (num_heap_parts == 0) {
31     for (int i = 1; i < num_stack_parts - 1; i++) {
32       MultiplySingle(Z, already_set, max_multiplier);
33       Add(Z, accumulator->stack_parts_[i]);
34       already_set.set_len(already_set.len() + 1);
35     }
36     MultiplySingle(Z, already_set, accumulator->last_multiplier_);
37     Add(Z, accumulator->stack_parts_[num_stack_parts - 1]);
38     return;
39   }
40   // Parts are stored on the heap.
41   for (int i = 1; i < num_heap_parts - 1; i++) {
42     MultiplySingle(Z, already_set, max_multiplier);
43     Add(Z, accumulator->heap_parts_[i]);
44     already_set.set_len(already_set.len() + 1);
45   }
46   MultiplySingle(Z, already_set, accumulator->last_multiplier_);
47   Add(Z, accumulator->heap_parts_.back());
48 }
49 
50 // The fast algorithm: combine parts in a balanced-binary-tree like order:
51 // Multiply-and-add neighboring pairs of parts, then loop, until only one
52 // part is left. The benefit is that the multiplications will have inputs of
53 // similar sizes, which makes them amenable to fast multiplication algorithms.
54 // We have to do more multiplications than the classic algorithm though,
55 // because we also have to multiply the multipliers.
56 // Optimizations:
57 // - We can skip the multiplier for the first part, because we never need it.
58 // - Most multipliers are the same; we can avoid repeated multiplications and
59 //   just copy the previous result. (In theory we could even de-dupe them, but
60 //   as the parts/multipliers grow, we'll need most of the memory anyway.)
61 //   Copied results are marked with a * below.
62 // - We can re-use memory using a system of three buffers whose usage rotates:
63 //   - one is considered empty, and is overwritten with the new parts,
64 //   - one holds the multipliers (and will be "empty" in the next round), and
65 //   - one initially holds the parts and is overwritten with the new multipliers
66 //   Parts and multipliers both grow in each iteration, and get fewer, so we
67 //   use the space of two adjacent old chunks for one new chunk.
68 //   Since the {heap_parts_} vectors has the right size, and so does the
69 //   result {Z}, we can use that memory, and only need to allocate one scratch
70 //   vector. If the final result ends up in the wrong bucket, we have to copy it
71 //   to the correct one.
72 // - We don't have to keep track of the positions and sizes of the chunks,
73 //   because we can deduce their precise placement from the iteration index.
74 //
75 // Example, assuming digit_t is 4 bits, fitting one decimal digit:
76 // Initial state:
77 // parts_:        1  2  3  4  5  6  7  8  9  0  1  2  3  4  5
78 // multipliers_: 10 10 10 10 10 10 10 10 10 10 10 10 10 10 10
79 // After the first iteration of the outer loop:
80 // parts:         12    34    56    78    90    12    34    5
81 // multipliers:        100  *100  *100  *100  *100  *100   10
82 // After the second iteration:
83 // parts:         1234        5678        9012        345
84 // multipliers:              10000      *10000       1000
85 // After the third iteration:
86 // parts:         12345678                9012345
87 // multipliers:                          10000000
88 // And then there's an obvious last iteration.
FromStringLarge(RWDigits Z,FromStringAccumulator * accumulator)89 void ProcessorImpl::FromStringLarge(RWDigits Z,
90                                     FromStringAccumulator* accumulator) {
91   int num_parts = static_cast<int>(accumulator->heap_parts_.size());
92   DCHECK(num_parts >= 2);  // NOLINT(readability/check)
93   DCHECK(Z.len() >= num_parts);
94   RWDigits parts(accumulator->heap_parts_.data(), num_parts);
95   Storage multipliers_storage(num_parts);
96   RWDigits multipliers(multipliers_storage.get(), num_parts);
97   RWDigits temp(Z, 0, num_parts);
98   // Unrolled and specialized first iteration: part_len == 1, so instead of
99   // Digits sub-vectors we have individual digit_t values, and the multipliers
100   // are known up front.
101   {
102     digit_t max_multiplier = accumulator->max_multiplier_;
103     digit_t last_multiplier = accumulator->last_multiplier_;
104     RWDigits new_parts = temp;
105     RWDigits new_multipliers = parts;
106     int i = 0;
107     for (; i + 1 < num_parts; i += 2) {
108       digit_t p_in = parts[i];
109       digit_t p_in2 = parts[i + 1];
110       digit_t m_in = max_multiplier;
111       digit_t m_in2 = i == num_parts - 2 ? last_multiplier : max_multiplier;
112       // p[j] = p[i] * m[i+1] + p[i+1]
113       digit_t p_high;
114       digit_t p_low = digit_mul(p_in, m_in2, &p_high);
115       digit_t carry;
116       new_parts[i] = digit_add2(p_low, p_in2, &carry);
117       new_parts[i + 1] = p_high + carry;
118       // m[j] = m[i] * m[i+1]
119       if (i > 0) {
120         if (i > 2 && m_in2 != last_multiplier) {
121           new_multipliers[i] = new_multipliers[i - 2];
122           new_multipliers[i + 1] = new_multipliers[i - 1];
123         } else {
124           digit_t m_high;
125           new_multipliers[i] = digit_mul(m_in, m_in2, &m_high);
126           new_multipliers[i + 1] = m_high;
127         }
128       }
129     }
130     // Trailing last part (if {num_parts} was odd).
131     if (i < num_parts) {
132       new_parts[i] = parts[i];
133       new_multipliers[i] = last_multiplier;
134       i += 2;
135     }
136     num_parts = i >> 1;
137     RWDigits new_temp = multipliers;
138     parts = new_parts;
139     multipliers = new_multipliers;
140     temp = new_temp;
141     AddWorkEstimate(num_parts);
142   }
143   int part_len = 2;
144 
145   // Remaining iterations.
146   while (num_parts > 1) {
147     RWDigits new_parts = temp;
148     RWDigits new_multipliers = parts;
149     int new_part_len = part_len * 2;
150     int i = 0;
151     for (; i + 1 < num_parts; i += 2) {
152       int start = i * part_len;
153       Digits p_in(parts, start, part_len);
154       Digits p_in2(parts, start + part_len, part_len);
155       Digits m_in(multipliers, start, part_len);
156       Digits m_in2(multipliers, start + part_len, part_len);
157       RWDigits p_out(new_parts, start, new_part_len);
158       RWDigits m_out(new_multipliers, start, new_part_len);
159       // p[j] = p[i] * m[i+1] + p[i+1]
160       Multiply(p_out, p_in, m_in2);
161       if (should_terminate()) return;
162       digit_t overflow = AddAndReturnOverflow(p_out, p_in2);
163       DCHECK(overflow == 0);  // NOLINT(readability/check)
164       USE(overflow);
165       // m[j] = m[i] * m[i+1]
166       if (i > 0) {
167         bool copied = false;
168         if (i > 2) {
169           int prev_start = (i - 2) * part_len;
170           Digits m_in_prev(multipliers, prev_start, part_len);
171           Digits m_in2_prev(multipliers, prev_start + part_len, part_len);
172           if (Compare(m_in, m_in_prev) == 0 &&
173               Compare(m_in2, m_in2_prev) == 0) {
174             copied = true;
175             Digits m_out_prev(new_multipliers, prev_start, new_part_len);
176             for (int k = 0; k < new_part_len; k++) m_out[k] = m_out_prev[k];
177           }
178         }
179         if (!copied) {
180           Multiply(m_out, m_in, m_in2);
181           if (should_terminate()) return;
182         }
183       }
184     }
185     // Trailing last part (if {num_parts} was odd).
186     if (i < num_parts) {
187       Digits p_in(parts, i * part_len, part_len);
188       Digits m_in(multipliers, i * part_len, part_len);
189       RWDigits p_out(new_parts, i * part_len, new_part_len);
190       RWDigits m_out(new_multipliers, i * part_len, new_part_len);
191       int k = 0;
192       for (; k < p_in.len(); k++) p_out[k] = p_in[k];
193       for (; k < p_out.len(); k++) p_out[k] = 0;
194       k = 0;
195       for (; k < m_in.len(); k++) m_out[k] = m_in[k];
196       for (; k < m_out.len(); k++) m_out[k] = 0;
197       i += 2;
198     }
199     num_parts = i >> 1;
200     part_len = new_part_len;
201     RWDigits new_temp = multipliers;
202     parts = new_parts;
203     multipliers = new_multipliers;
204     temp = new_temp;
205   }
206   // Copy the result to Z, if it doesn't happen to be there already.
207   if (parts.digits() != Z.digits()) {
208     int i = 0;
209     for (; i < parts.len(); i++) Z[i] = parts[i];
210     // Z might be bigger than we requested; be robust towards that.
211     for (; i < Z.len(); i++) Z[i] = 0;
212   }
213 }
214 
215 // Specialized algorithms for power-of-two radixes. Designed to work with
216 // {ParsePowerTwo}: {max_multiplier_} isn't saved, but {radix_} is, and
217 // {last_multiplier_} has special meaning, namely the number of unpopulated bits
218 // in the last part.
219 // For these radixes, {parts} already is a list of correct bit sequences, we
220 // just have to put them together in the right way:
221 // - The parts are currently in reversed order. The highest-index parts[i]
222 //   will go into Z[0].
223 // - All parts, possibly except for the last, are maximally populated.
224 // - A maximally populated part stores a non-fractional number of characters,
225 //   i.e. the largest fitting multiple of {char_bits} of it is populated.
226 // - The populated bits in a part are at the low end.
227 // - The number of unused bits in the last part is stored in
228 //   {accumulator->last_multiplier_}.
229 //
230 // Example: Given the following parts vector, where letters are used to
231 // label bits, bit order is big endian (i.e. [00000101] encodes "5"),
232 // 'x' means "unpopulated", kDigitBits == 8, radix == 8, and char_bits == 3:
233 //
234 //     parts[0] -> [xxABCDEF][xxGHIJKL][xxMNOPQR][xxxxxSTU] <- parts[3]
235 //
236 // We have to assemble the following result:
237 //
238 //         Z[0] -> [NOPQRSTU][FGHIJKLM][xxxABCDE] <- Z[2]
239 //
FromStringBasePowerOfTwo(RWDigits Z,FromStringAccumulator * accumulator)240 void ProcessorImpl::FromStringBasePowerOfTwo(
241     RWDigits Z, FromStringAccumulator* accumulator) {
242   const int num_parts = accumulator->ResultLength();
243   DCHECK(num_parts >= 1);  // NOLINT(readability/check)
244   DCHECK(Z.len() >= num_parts);
245   Digits parts(accumulator->heap_parts_.size() > 0
246                    ? accumulator->heap_parts_.data()
247                    : accumulator->stack_parts_,
248                num_parts);
249   uint8_t radix = accumulator->radix_;
250   DCHECK(radix == 2 || radix == 4 || radix == 8 || radix == 16 || radix == 32);
251   const int char_bits = BitLength(radix - 1);
252   const int unused_last_part_bits =
253       static_cast<int>(accumulator->last_multiplier_);
254   const int unused_part_bits = kDigitBits % char_bits;
255   const int max_part_bits = kDigitBits - unused_part_bits;
256   int z_index = 0;
257   int part_index = num_parts - 1;
258 
259   // If the last part is fully populated, then all parts must be, and we can
260   // simply copy them (in reversed order).
261   if (unused_last_part_bits == 0) {
262     DCHECK(kDigitBits % char_bits == 0);  // NOLINT(readability/check)
263     while (part_index >= 0) {
264       Z[z_index++] = parts[part_index--];
265     }
266     for (; z_index < Z.len(); z_index++) Z[z_index] = 0;
267     return;
268   }
269 
270   // Otherwise we have to shift parts contents around as needed.
271   // Holds the next Z digit that we want to store...
272   digit_t digit = parts[part_index--];
273   // ...and the number of bits (at the right end) we already know.
274   int digit_bits = kDigitBits - unused_last_part_bits;
275   while (part_index >= 0) {
276     // Holds the last part that we read from {parts}...
277     digit_t part;
278     // ...and the number of bits (at the right end) that we haven't used yet.
279     int part_bits;
280     while (digit_bits < kDigitBits) {
281       part = parts[part_index--];
282       part_bits = max_part_bits;
283       digit |= part << digit_bits;
284       int part_shift = kDigitBits - digit_bits;
285       if (part_shift > part_bits) {
286         digit_bits += part_bits;
287         part = 0;
288         part_bits = 0;
289         if (part_index < 0) break;
290       } else {
291         digit_bits = kDigitBits;
292         part >>= part_shift;
293         part_bits -= part_shift;
294       }
295     }
296     Z[z_index++] = digit;
297     digit = part;
298     digit_bits = part_bits;
299   }
300   if (digit_bits > 0) {
301     Z[z_index++] = digit;
302   }
303   for (; z_index < Z.len(); z_index++) Z[z_index] = 0;
304 }
305 
FromString(RWDigits Z,FromStringAccumulator * accumulator)306 void ProcessorImpl::FromString(RWDigits Z, FromStringAccumulator* accumulator) {
307   if (accumulator->inline_everything_) {
308     int i = 0;
309     for (; i < accumulator->stack_parts_used_; i++) {
310       Z[i] = accumulator->stack_parts_[i];
311     }
312     for (; i < Z.len(); i++) Z[i] = 0;
313   } else if (accumulator->stack_parts_used_ == 0) {
314     for (int i = 0; i < Z.len(); i++) Z[i] = 0;
315   } else if (IsPowerOfTwo(accumulator->radix_)) {
316     FromStringBasePowerOfTwo(Z, accumulator);
317   } else if (accumulator->ResultLength() < kFromStringLargeThreshold) {
318     FromStringClassic(Z, accumulator);
319   } else {
320     FromStringLarge(Z, accumulator);
321   }
322 }
323 
FromString(RWDigits Z,FromStringAccumulator * accumulator)324 Status Processor::FromString(RWDigits Z, FromStringAccumulator* accumulator) {
325   ProcessorImpl* impl = static_cast<ProcessorImpl*>(this);
326   impl->FromString(Z, accumulator);
327   return impl->get_and_clear_status();
328 }
329 
330 }  // namespace bigint
331 }  // namespace v8
332