1 //! Rust adaptation of the Grisu3 algorithm described in "Printing Floating-Point Numbers Quickly
2 //! and Accurately with Integers"[^1]. It uses about 1KB of precomputed table, and in turn, it's
3 //! very quick for most inputs.
4 //!
5 //! [^1]: Florian Loitsch. 2010. Printing floating-point numbers quickly and
6 //!   accurately with integers. SIGPLAN Not. 45, 6 (June 2010), 233-243.
7 
8 use crate::mem::MaybeUninit;
9 use crate::num::diy_float::Fp;
10 use crate::num::flt2dec::{round_up, Decoded, MAX_SIG_DIGITS};
11 
12 // see the comments in `format_shortest_opt` for the rationale.
13 #[doc(hidden)]
14 pub const ALPHA: i16 = -60;
15 #[doc(hidden)]
16 pub const GAMMA: i16 = -32;
17 
18 /*
19 # the following Python code generates this table:
20 for i in xrange(-308, 333, 8):
21     if i >= 0: f = 10**i; e = 0
22     else: f = 2**(80-4*i) // 10**-i; e = 4 * i - 80
23     l = f.bit_length()
24     f = ((f << 64 >> (l-1)) + 1) >> 1; e += l - 64
25     print '    (%#018x, %5d, %4d),' % (f, e, i)
26 */
27 
28 #[doc(hidden)]
29 pub static CACHED_POW10: [(u64, i16, i16); 81] = [
30     // (f, e, k)
31     (0xe61acf033d1a45df, -1087, -308),
32     (0xab70fe17c79ac6ca, -1060, -300),
33     (0xff77b1fcbebcdc4f, -1034, -292),
34     (0xbe5691ef416bd60c, -1007, -284),
35     (0x8dd01fad907ffc3c, -980, -276),
36     (0xd3515c2831559a83, -954, -268),
37     (0x9d71ac8fada6c9b5, -927, -260),
38     (0xea9c227723ee8bcb, -901, -252),
39     (0xaecc49914078536d, -874, -244),
40     (0x823c12795db6ce57, -847, -236),
41     (0xc21094364dfb5637, -821, -228),
42     (0x9096ea6f3848984f, -794, -220),
43     (0xd77485cb25823ac7, -768, -212),
44     (0xa086cfcd97bf97f4, -741, -204),
45     (0xef340a98172aace5, -715, -196),
46     (0xb23867fb2a35b28e, -688, -188),
47     (0x84c8d4dfd2c63f3b, -661, -180),
48     (0xc5dd44271ad3cdba, -635, -172),
49     (0x936b9fcebb25c996, -608, -164),
50     (0xdbac6c247d62a584, -582, -156),
51     (0xa3ab66580d5fdaf6, -555, -148),
52     (0xf3e2f893dec3f126, -529, -140),
53     (0xb5b5ada8aaff80b8, -502, -132),
54     (0x87625f056c7c4a8b, -475, -124),
55     (0xc9bcff6034c13053, -449, -116),
56     (0x964e858c91ba2655, -422, -108),
57     (0xdff9772470297ebd, -396, -100),
58     (0xa6dfbd9fb8e5b88f, -369, -92),
59     (0xf8a95fcf88747d94, -343, -84),
60     (0xb94470938fa89bcf, -316, -76),
61     (0x8a08f0f8bf0f156b, -289, -68),
62     (0xcdb02555653131b6, -263, -60),
63     (0x993fe2c6d07b7fac, -236, -52),
64     (0xe45c10c42a2b3b06, -210, -44),
65     (0xaa242499697392d3, -183, -36),
66     (0xfd87b5f28300ca0e, -157, -28),
67     (0xbce5086492111aeb, -130, -20),
68     (0x8cbccc096f5088cc, -103, -12),
69     (0xd1b71758e219652c, -77, -4),
70     (0x9c40000000000000, -50, 4),
71     (0xe8d4a51000000000, -24, 12),
72     (0xad78ebc5ac620000, 3, 20),
73     (0x813f3978f8940984, 30, 28),
74     (0xc097ce7bc90715b3, 56, 36),
75     (0x8f7e32ce7bea5c70, 83, 44),
76     (0xd5d238a4abe98068, 109, 52),
77     (0x9f4f2726179a2245, 136, 60),
78     (0xed63a231d4c4fb27, 162, 68),
79     (0xb0de65388cc8ada8, 189, 76),
80     (0x83c7088e1aab65db, 216, 84),
81     (0xc45d1df942711d9a, 242, 92),
82     (0x924d692ca61be758, 269, 100),
83     (0xda01ee641a708dea, 295, 108),
84     (0xa26da3999aef774a, 322, 116),
85     (0xf209787bb47d6b85, 348, 124),
86     (0xb454e4a179dd1877, 375, 132),
87     (0x865b86925b9bc5c2, 402, 140),
88     (0xc83553c5c8965d3d, 428, 148),
89     (0x952ab45cfa97a0b3, 455, 156),
90     (0xde469fbd99a05fe3, 481, 164),
91     (0xa59bc234db398c25, 508, 172),
92     (0xf6c69a72a3989f5c, 534, 180),
93     (0xb7dcbf5354e9bece, 561, 188),
94     (0x88fcf317f22241e2, 588, 196),
95     (0xcc20ce9bd35c78a5, 614, 204),
96     (0x98165af37b2153df, 641, 212),
97     (0xe2a0b5dc971f303a, 667, 220),
98     (0xa8d9d1535ce3b396, 694, 228),
99     (0xfb9b7cd9a4a7443c, 720, 236),
100     (0xbb764c4ca7a44410, 747, 244),
101     (0x8bab8eefb6409c1a, 774, 252),
102     (0xd01fef10a657842c, 800, 260),
103     (0x9b10a4e5e9913129, 827, 268),
104     (0xe7109bfba19c0c9d, 853, 276),
105     (0xac2820d9623bf429, 880, 284),
106     (0x80444b5e7aa7cf85, 907, 292),
107     (0xbf21e44003acdd2d, 933, 300),
108     (0x8e679c2f5e44ff8f, 960, 308),
109     (0xd433179d9c8cb841, 986, 316),
110     (0x9e19db92b4e31ba9, 1013, 324),
111     (0xeb96bf6ebadf77d9, 1039, 332),
112 ];
113 
114 #[doc(hidden)]
115 pub const CACHED_POW10_FIRST_E: i16 = -1087;
116 #[doc(hidden)]
117 pub const CACHED_POW10_LAST_E: i16 = 1039;
118 
119 #[doc(hidden)]
cached_power(alpha: i16, gamma: i16) -> (i16, Fp)120 pub fn cached_power(alpha: i16, gamma: i16) -> (i16, Fp) {
121     let offset = CACHED_POW10_FIRST_E as i32;
122     let range = (CACHED_POW10.len() as i32) - 1;
123     let domain = (CACHED_POW10_LAST_E - CACHED_POW10_FIRST_E) as i32;
124     let idx = ((gamma as i32) - offset) * range / domain;
125     let (f, e, k) = CACHED_POW10[idx as usize];
126     debug_assert!(alpha <= e && e <= gamma);
127     (k, Fp { f, e })
128 }
129 
130 /// Given `x > 0`, returns `(k, 10^k)` such that `10^k <= x < 10^(k+1)`.
131 #[doc(hidden)]
max_pow10_no_more_than(x: u32) -> (u8, u32)132 pub fn max_pow10_no_more_than(x: u32) -> (u8, u32) {
133     debug_assert!(x > 0);
134 
135     const X9: u32 = 10_0000_0000;
136     const X8: u32 = 1_0000_0000;
137     const X7: u32 = 1000_0000;
138     const X6: u32 = 100_0000;
139     const X5: u32 = 10_0000;
140     const X4: u32 = 1_0000;
141     const X3: u32 = 1000;
142     const X2: u32 = 100;
143     const X1: u32 = 10;
144 
145     if x < X4 {
146         if x < X2 {
147             if x < X1 { (0, 1) } else { (1, X1) }
148         } else {
149             if x < X3 { (2, X2) } else { (3, X3) }
150         }
151     } else {
152         if x < X6 {
153             if x < X5 { (4, X4) } else { (5, X5) }
154         } else if x < X8 {
155             if x < X7 { (6, X6) } else { (7, X7) }
156         } else {
157             if x < X9 { (8, X8) } else { (9, X9) }
158         }
159     }
160 }
161 
162 /// The shortest mode implementation for Grisu.
163 ///
164 /// It returns `None` when it would return an inexact representation otherwise.
format_shortest_opt<'a>( d: &Decoded, buf: &'a mut [MaybeUninit<u8>], ) -> Option<( &'a [u8], i16)>165 pub fn format_shortest_opt<'a>(
166     d: &Decoded,
167     buf: &'a mut [MaybeUninit<u8>],
168 ) -> Option<(/*digits*/ &'a [u8], /*exp*/ i16)> {
169     assert!(d.mant > 0);
170     assert!(d.minus > 0);
171     assert!(d.plus > 0);
172     assert!(d.mant.checked_add(d.plus).is_some());
173     assert!(d.mant.checked_sub(d.minus).is_some());
174     assert!(buf.len() >= MAX_SIG_DIGITS);
175     assert!(d.mant + d.plus < (1 << 61)); // we need at least three bits of additional precision
176 
177     // start with the normalized values with the shared exponent
178     let plus = Fp { f: d.mant + d.plus, e: d.exp }.normalize();
179     let minus = Fp { f: d.mant - d.minus, e: d.exp }.normalize_to(plus.e);
180     let v = Fp { f: d.mant, e: d.exp }.normalize_to(plus.e);
181 
182     // find any `cached = 10^minusk` such that `ALPHA <= minusk + plus.e + 64 <= GAMMA`.
183     // since `plus` is normalized, this means `2^(62 + ALPHA) <= plus * cached < 2^(64 + GAMMA)`;
184     // given our choices of `ALPHA` and `GAMMA`, this puts `plus * cached` into `[4, 2^32)`.
185     //
186     // it is obviously desirable to maximize `GAMMA - ALPHA`,
187     // so that we don't need many cached powers of 10, but there are some considerations:
188     //
189     // 1. we want to keep `floor(plus * cached)` within `u32` since it needs a costly division.
190     //    (this is not really avoidable, remainder is required for accuracy estimation.)
191     // 2. the remainder of `floor(plus * cached)` repeatedly gets multiplied by 10,
192     //    and it should not overflow.
193     //
194     // the first gives `64 + GAMMA <= 32`, while the second gives `10 * 2^-ALPHA <= 2^64`;
195     // -60 and -32 is the maximal range with this constraint, and V8 also uses them.
196     let (minusk, cached) = cached_power(ALPHA - plus.e - 64, GAMMA - plus.e - 64);
197 
198     // scale fps. this gives the maximal error of 1 ulp (proved from Theorem 5.1).
199     let plus = plus.mul(&cached);
200     let minus = minus.mul(&cached);
201     let v = v.mul(&cached);
202     debug_assert_eq!(plus.e, minus.e);
203     debug_assert_eq!(plus.e, v.e);
204 
205     //         +- actual range of minus
206     //   | <---|---------------------- unsafe region --------------------------> |
207     //   |     |                                                                 |
208     //   |  |<--->|  | <--------------- safe region ---------------> |           |
209     //   |  |     |  |                                               |           |
210     //   |1 ulp|1 ulp|                 |1 ulp|1 ulp|                 |1 ulp|1 ulp|
211     //   |<--->|<--->|                 |<--->|<--->|                 |<--->|<--->|
212     //   |-----|-----|-------...-------|-----|-----|-------...-------|-----|-----|
213     //   |   minus   |                 |     v     |                 |   plus    |
214     // minus1     minus0           v - 1 ulp   v + 1 ulp           plus0       plus1
215     //
216     // above `minus`, `v` and `plus` are *quantized* approximations (error < 1 ulp).
217     // as we don't know the error is positive or negative, we use two approximations spaced equally
218     // and have the maximal error of 2 ulps.
219     //
220     // the "unsafe region" is a liberal interval which we initially generate.
221     // the "safe region" is a conservative interval which we only accept.
222     // we start with the correct repr within the unsafe region, and try to find the closest repr
223     // to `v` which is also within the safe region. if we can't, we give up.
224     let plus1 = plus.f + 1;
225     //  let plus0 = plus.f - 1; // only for explanation
226     //  let minus0 = minus.f + 1; // only for explanation
227     let minus1 = minus.f - 1;
228     let e = -plus.e as usize; // shared exponent
229 
230     // divide `plus1` into integral and fractional parts.
231     // integral parts are guaranteed to fit in u32, since cached power guarantees `plus < 2^32`
232     // and normalized `plus.f` is always less than `2^64 - 2^4` due to the precision requirement.
233     let plus1int = (plus1 >> e) as u32;
234     let plus1frac = plus1 & ((1 << e) - 1);
235 
236     // calculate the largest `10^max_kappa` no more than `plus1` (thus `plus1 < 10^(max_kappa+1)`).
237     // this is an upper bound of `kappa` below.
238     let (max_kappa, max_ten_kappa) = max_pow10_no_more_than(plus1int);
239 
240     let mut i = 0;
241     let exp = max_kappa as i16 - minusk + 1;
242 
243     // Theorem 6.2: if `k` is the greatest integer s.t. `0 <= y mod 10^k <= y - x`,
244     //              then `V = floor(y / 10^k) * 10^k` is in `[x, y]` and one of the shortest
245     //              representations (with the minimal number of significant digits) in that range.
246     //
247     // find the digit length `kappa` between `(minus1, plus1)` as per Theorem 6.2.
248     // Theorem 6.2 can be adopted to exclude `x` by requiring `y mod 10^k < y - x` instead.
249     // (e.g., `x` = 32000, `y` = 32777; `kappa` = 2 since `y mod 10^3 = 777 < y - x = 777`.)
250     // the algorithm relies on the later verification phase to exclude `y`.
251     let delta1 = plus1 - minus1;
252     //  let delta1int = (delta1 >> e) as usize; // only for explanation
253     let delta1frac = delta1 & ((1 << e) - 1);
254 
255     // render integral parts, while checking for the accuracy at each step.
256     let mut kappa = max_kappa as i16;
257     let mut ten_kappa = max_ten_kappa; // 10^kappa
258     let mut remainder = plus1int; // digits yet to be rendered
259     loop {
260         // we always have at least one digit to render, as `plus1 >= 10^kappa`
261         // invariants:
262         // - `delta1int <= remainder < 10^(kappa+1)`
263         // - `plus1int = d[0..n-1] * 10^(kappa+1) + remainder`
264         //   (it follows that `remainder = plus1int % 10^(kappa+1)`)
265 
266         // divide `remainder` by `10^kappa`. both are scaled by `2^-e`.
267         let q = remainder / ten_kappa;
268         let r = remainder % ten_kappa;
269         debug_assert!(q < 10);
270         buf[i] = MaybeUninit::new(b'0' + q as u8);
271         i += 1;
272 
273         let plus1rem = ((r as u64) << e) + plus1frac; // == (plus1 % 10^kappa) * 2^e
274         if plus1rem < delta1 {
275             // `plus1 % 10^kappa < delta1 = plus1 - minus1`; we've found the correct `kappa`.
276             let ten_kappa = (ten_kappa as u64) << e; // scale 10^kappa back to the shared exponent
277             return round_and_weed(
278                 // SAFETY: we initialized that memory above.
279                 unsafe { MaybeUninit::slice_assume_init_mut(&mut buf[..i]) },
280                 exp,
281                 plus1rem,
282                 delta1,
283                 plus1 - v.f,
284                 ten_kappa,
285                 1,
286             );
287         }
288 
289         // break the loop when we have rendered all integral digits.
290         // the exact number of digits is `max_kappa + 1` as `plus1 < 10^(max_kappa+1)`.
291         if i > max_kappa as usize {
292             debug_assert_eq!(ten_kappa, 1);
293             debug_assert_eq!(kappa, 0);
294             break;
295         }
296 
297         // restore invariants
298         kappa -= 1;
299         ten_kappa /= 10;
300         remainder = r;
301     }
302 
303     // render fractional parts, while checking for the accuracy at each step.
304     // this time we rely on repeated multiplications, as division will lose the precision.
305     let mut remainder = plus1frac;
306     let mut threshold = delta1frac;
307     let mut ulp = 1;
308     loop {
309         // the next digit should be significant as we've tested that before breaking out
310         // invariants, where `m = max_kappa + 1` (# of digits in the integral part):
311         // - `remainder < 2^e`
312         // - `plus1frac * 10^(n-m) = d[m..n-1] * 2^e + remainder`
313 
314         remainder *= 10; // won't overflow, `2^e * 10 < 2^64`
315         threshold *= 10;
316         ulp *= 10;
317 
318         // divide `remainder` by `10^kappa`.
319         // both are scaled by `2^e / 10^kappa`, so the latter is implicit here.
320         let q = remainder >> e;
321         let r = remainder & ((1 << e) - 1);
322         debug_assert!(q < 10);
323         buf[i] = MaybeUninit::new(b'0' + q as u8);
324         i += 1;
325 
326         if r < threshold {
327             let ten_kappa = 1 << e; // implicit divisor
328             return round_and_weed(
329                 // SAFETY: we initialized that memory above.
330                 unsafe { MaybeUninit::slice_assume_init_mut(&mut buf[..i]) },
331                 exp,
332                 r,
333                 threshold,
334                 (plus1 - v.f) * ulp,
335                 ten_kappa,
336                 ulp,
337             );
338         }
339 
340         // restore invariants
341         kappa -= 1;
342         remainder = r;
343     }
344 
345     // we've generated all significant digits of `plus1`, but not sure if it's the optimal one.
346     // for example, if `minus1` is 3.14153... and `plus1` is 3.14158..., there are 5 different
347     // shortest representation from 3.14154 to 3.14158 but we only have the greatest one.
348     // we have to successively decrease the last digit and check if this is the optimal repr.
349     // there are at most 9 candidates (..1 to ..9), so this is fairly quick. ("rounding" phase)
350     //
351     // the function checks if this "optimal" repr is actually within the ulp ranges,
352     // and also, it is possible that the "second-to-optimal" repr can actually be optimal
353     // due to the rounding error. in either cases this returns `None`. ("weeding" phase)
354     //
355     // all arguments here are scaled by the common (but implicit) value `k`, so that:
356     // - `remainder = (plus1 % 10^kappa) * k`
357     // - `threshold = (plus1 - minus1) * k` (and also, `remainder < threshold`)
358     // - `plus1v = (plus1 - v) * k` (and also, `threshold > plus1v` from prior invariants)
359     // - `ten_kappa = 10^kappa * k`
360     // - `ulp = 2^-e * k`
361     fn round_and_weed(
362         buf: &mut [u8],
363         exp: i16,
364         remainder: u64,
365         threshold: u64,
366         plus1v: u64,
367         ten_kappa: u64,
368         ulp: u64,
369     ) -> Option<(&[u8], i16)> {
370         assert!(!buf.is_empty());
371 
372         // produce two approximations to `v` (actually `plus1 - v`) within 1.5 ulps.
373         // the resulting representation should be the closest representation to both.
374         //
375         // here `plus1 - v` is used since calculations are done with respect to `plus1`
376         // in order to avoid overflow/underflow (hence the seemingly swapped names).
377         let plus1v_down = plus1v + ulp; // plus1 - (v - 1 ulp)
378         let plus1v_up = plus1v - ulp; // plus1 - (v + 1 ulp)
379 
380         // decrease the last digit and stop at the closest representation to `v + 1 ulp`.
381         let mut plus1w = remainder; // plus1w(n) = plus1 - w(n)
382         {
383             let last = buf.last_mut().unwrap();
384 
385             // we work with the approximated digits `w(n)`, which is initially equal to `plus1 -
386             // plus1 % 10^kappa`. after running the loop body `n` times, `w(n) = plus1 -
387             // plus1 % 10^kappa - n * 10^kappa`. we set `plus1w(n) = plus1 - w(n) =
388             // plus1 % 10^kappa + n * 10^kappa` (thus `remainder = plus1w(0)`) to simplify checks.
389             // note that `plus1w(n)` is always increasing.
390             //
391             // we have three conditions to terminate. any of them will make the loop unable to
392             // proceed, but we then have at least one valid representation known to be closest to
393             // `v + 1 ulp` anyway. we will denote them as TC1 through TC3 for brevity.
394             //
395             // TC1: `w(n) <= v + 1 ulp`, i.e., this is the last repr that can be the closest one.
396             // this is equivalent to `plus1 - w(n) = plus1w(n) >= plus1 - (v + 1 ulp) = plus1v_up`.
397             // combined with TC2 (which checks if `w(n+1)` is valid), this prevents the possible
398             // overflow on the calculation of `plus1w(n)`.
399             //
400             // TC2: `w(n+1) < minus1`, i.e., the next repr definitely does not round to `v`.
401             // this is equivalent to `plus1 - w(n) + 10^kappa = plus1w(n) + 10^kappa >
402             // plus1 - minus1 = threshold`. the left hand side can overflow, but we know
403             // `threshold > plus1v`, so if TC1 is false, `threshold - plus1w(n) >
404             // threshold - (plus1v - 1 ulp) > 1 ulp` and we can safely test if
405             // `threshold - plus1w(n) < 10^kappa` instead.
406             //
407             // TC3: `abs(w(n) - (v + 1 ulp)) <= abs(w(n+1) - (v + 1 ulp))`, i.e., the next repr is
408             // no closer to `v + 1 ulp` than the current repr. given `z(n) = plus1v_up - plus1w(n)`,
409             // this becomes `abs(z(n)) <= abs(z(n+1))`. again assuming that TC1 is false, we have
410             // `z(n) > 0`. we have two cases to consider:
411             //
412             // - when `z(n+1) >= 0`: TC3 becomes `z(n) <= z(n+1)`. as `plus1w(n)` is increasing,
413             //   `z(n)` should be decreasing and this is clearly false.
414             // - when `z(n+1) < 0`:
415             //   - TC3a: the precondition is `plus1v_up < plus1w(n) + 10^kappa`. assuming TC2 is
416             //     false, `threshold >= plus1w(n) + 10^kappa` so it cannot overflow.
417             //   - TC3b: TC3 becomes `z(n) <= -z(n+1)`, i.e., `plus1v_up - plus1w(n) >=
418             //     plus1w(n+1) - plus1v_up = plus1w(n) + 10^kappa - plus1v_up`. the negated TC1
419             //     gives `plus1v_up > plus1w(n)`, so it cannot overflow or underflow when
420             //     combined with TC3a.
421             //
422             // consequently, we should stop when `TC1 || TC2 || (TC3a && TC3b)`. the following is
423             // equal to its inverse, `!TC1 && !TC2 && (!TC3a || !TC3b)`.
424             while plus1w < plus1v_up
425                 && threshold - plus1w >= ten_kappa
426                 && (plus1w + ten_kappa < plus1v_up
427                     || plus1v_up - plus1w >= plus1w + ten_kappa - plus1v_up)
428             {
429                 *last -= 1;
430                 debug_assert!(*last > b'0'); // the shortest repr cannot end with `0`
431                 plus1w += ten_kappa;
432             }
433         }
434 
435         // check if this representation is also the closest representation to `v - 1 ulp`.
436         //
437         // this is simply same to the terminating conditions for `v + 1 ulp`, with all `plus1v_up`
438         // replaced by `plus1v_down` instead. overflow analysis equally holds.
439         if plus1w < plus1v_down
440             && threshold - plus1w >= ten_kappa
441             && (plus1w + ten_kappa < plus1v_down
442                 || plus1v_down - plus1w >= plus1w + ten_kappa - plus1v_down)
443         {
444             return None;
445         }
446 
447         // now we have the closest representation to `v` between `plus1` and `minus1`.
448         // this is too liberal, though, so we reject any `w(n)` not between `plus0` and `minus0`,
449         // i.e., `plus1 - plus1w(n) <= minus0` or `plus1 - plus1w(n) >= plus0`. we utilize the facts
450         // that `threshold = plus1 - minus1` and `plus1 - plus0 = minus0 - minus1 = 2 ulp`.
451         if 2 * ulp <= plus1w && plus1w <= threshold - 4 * ulp { Some((buf, exp)) } else { None }
452     }
453 }
454 
455 /// The shortest mode implementation for Grisu with Dragon fallback.
456 ///
457 /// This should be used for most cases.
format_shortest<'a>( d: &Decoded, buf: &'a mut [MaybeUninit<u8>], ) -> ( &'a [u8], i16)458 pub fn format_shortest<'a>(
459     d: &Decoded,
460     buf: &'a mut [MaybeUninit<u8>],
461 ) -> (/*digits*/ &'a [u8], /*exp*/ i16) {
462     use crate::num::flt2dec::strategy::dragon::format_shortest as fallback;
463     // SAFETY: The borrow checker is not smart enough to let us use `buf`
464     // in the second branch, so we launder the lifetime here. But we only re-use
465     // `buf` if `format_shortest_opt` returned `None` so this is okay.
466     match format_shortest_opt(d, unsafe { &mut *(buf as *mut _) }) {
467         Some(ret) => ret,
468         None => fallback(d, buf),
469     }
470 }
471 
472 /// The exact and fixed mode implementation for Grisu.
473 ///
474 /// It returns `None` when it would return an inexact representation otherwise.
format_exact_opt<'a>( d: &Decoded, buf: &'a mut [MaybeUninit<u8>], limit: i16, ) -> Option<( &'a [u8], i16)>475 pub fn format_exact_opt<'a>(
476     d: &Decoded,
477     buf: &'a mut [MaybeUninit<u8>],
478     limit: i16,
479 ) -> Option<(/*digits*/ &'a [u8], /*exp*/ i16)> {
480     assert!(d.mant > 0);
481     assert!(d.mant < (1 << 61)); // we need at least three bits of additional precision
482     assert!(!buf.is_empty());
483 
484     // normalize and scale `v`.
485     let v = Fp { f: d.mant, e: d.exp }.normalize();
486     let (minusk, cached) = cached_power(ALPHA - v.e - 64, GAMMA - v.e - 64);
487     let v = v.mul(&cached);
488 
489     // divide `v` into integral and fractional parts.
490     let e = -v.e as usize;
491     let vint = (v.f >> e) as u32;
492     let vfrac = v.f & ((1 << e) - 1);
493 
494     // both old `v` and new `v` (scaled by `10^-k`) has an error of < 1 ulp (Theorem 5.1).
495     // as we don't know the error is positive or negative, we use two approximations
496     // spaced equally and have the maximal error of 2 ulps (same to the shortest case).
497     //
498     // the goal is to find the exactly rounded series of digits that are common to
499     // both `v - 1 ulp` and `v + 1 ulp`, so that we are maximally confident.
500     // if this is not possible, we don't know which one is the correct output for `v`,
501     // so we give up and fall back.
502     //
503     // `err` is defined as `1 ulp * 2^e` here (same to the ulp in `vfrac`),
504     // and we will scale it whenever `v` gets scaled.
505     let mut err = 1;
506 
507     // calculate the largest `10^max_kappa` no more than `v` (thus `v < 10^(max_kappa+1)`).
508     // this is an upper bound of `kappa` below.
509     let (max_kappa, max_ten_kappa) = max_pow10_no_more_than(vint);
510 
511     let mut i = 0;
512     let exp = max_kappa as i16 - minusk + 1;
513 
514     // if we are working with the last-digit limitation, we need to shorten the buffer
515     // before the actual rendering in order to avoid double rounding.
516     // note that we have to enlarge the buffer again when rounding up happens!
517     let len = if exp <= limit {
518         // oops, we cannot even produce *one* digit.
519         // this is possible when, say, we've got something like 9.5 and it's being rounded to 10.
520         //
521         // in principle we can immediately call `possibly_round` with an empty buffer,
522         // but scaling `max_ten_kappa << e` by 10 can result in overflow.
523         // thus we are being sloppy here and widen the error range by a factor of 10.
524         // this will increase the false negative rate, but only very, *very* slightly;
525         // it can only matter noticeably when the mantissa is bigger than 60 bits.
526         //
527         // SAFETY: `len=0`, so the obligation of having initialized this memory is trivial.
528         return unsafe {
529             possibly_round(buf, 0, exp, limit, v.f / 10, (max_ten_kappa as u64) << e, err << e)
530         };
531     } else if ((exp as i32 - limit as i32) as usize) < buf.len() {
532         (exp - limit) as usize
533     } else {
534         buf.len()
535     };
536     debug_assert!(len > 0);
537 
538     // render integral parts.
539     // the error is entirely fractional, so we don't need to check it in this part.
540     let mut kappa = max_kappa as i16;
541     let mut ten_kappa = max_ten_kappa; // 10^kappa
542     let mut remainder = vint; // digits yet to be rendered
543     loop {
544         // we always have at least one digit to render
545         // invariants:
546         // - `remainder < 10^(kappa+1)`
547         // - `vint = d[0..n-1] * 10^(kappa+1) + remainder`
548         //   (it follows that `remainder = vint % 10^(kappa+1)`)
549 
550         // divide `remainder` by `10^kappa`. both are scaled by `2^-e`.
551         let q = remainder / ten_kappa;
552         let r = remainder % ten_kappa;
553         debug_assert!(q < 10);
554         buf[i] = MaybeUninit::new(b'0' + q as u8);
555         i += 1;
556 
557         // is the buffer full? run the rounding pass with the remainder.
558         if i == len {
559             let vrem = ((r as u64) << e) + vfrac; // == (v % 10^kappa) * 2^e
560             // SAFETY: we have initialized `len` many bytes.
561             return unsafe {
562                 possibly_round(buf, len, exp, limit, vrem, (ten_kappa as u64) << e, err << e)
563             };
564         }
565 
566         // break the loop when we have rendered all integral digits.
567         // the exact number of digits is `max_kappa + 1` as `plus1 < 10^(max_kappa+1)`.
568         if i > max_kappa as usize {
569             debug_assert_eq!(ten_kappa, 1);
570             debug_assert_eq!(kappa, 0);
571             break;
572         }
573 
574         // restore invariants
575         kappa -= 1;
576         ten_kappa /= 10;
577         remainder = r;
578     }
579 
580     // render fractional parts.
581     //
582     // in principle we can continue to the last available digit and check for the accuracy.
583     // unfortunately we are working with the finite-sized integers, so we need some criterion
584     // to detect the overflow. V8 uses `remainder > err`, which becomes false when
585     // the first `i` significant digits of `v - 1 ulp` and `v` differ. however this rejects
586     // too many otherwise valid input.
587     //
588     // since the later phase has a correct overflow detection, we instead use tighter criterion:
589     // we continue til `err` exceeds `10^kappa / 2`, so that the range between `v - 1 ulp` and
590     // `v + 1 ulp` definitely contains two or more rounded representations. this is same to
591     // the first two comparisons from `possibly_round`, for the reference.
592     let mut remainder = vfrac;
593     let maxerr = 1 << (e - 1);
594     while err < maxerr {
595         // invariants, where `m = max_kappa + 1` (# of digits in the integral part):
596         // - `remainder < 2^e`
597         // - `vfrac * 10^(n-m) = d[m..n-1] * 2^e + remainder`
598         // - `err = 10^(n-m)`
599 
600         remainder *= 10; // won't overflow, `2^e * 10 < 2^64`
601         err *= 10; // won't overflow, `err * 10 < 2^e * 5 < 2^64`
602 
603         // divide `remainder` by `10^kappa`.
604         // both are scaled by `2^e / 10^kappa`, so the latter is implicit here.
605         let q = remainder >> e;
606         let r = remainder & ((1 << e) - 1);
607         debug_assert!(q < 10);
608         buf[i] = MaybeUninit::new(b'0' + q as u8);
609         i += 1;
610 
611         // is the buffer full? run the rounding pass with the remainder.
612         if i == len {
613             // SAFETY: we have initialized `len` many bytes.
614             return unsafe { possibly_round(buf, len, exp, limit, r, 1 << e, err) };
615         }
616 
617         // restore invariants
618         remainder = r;
619     }
620 
621     // further calculation is useless (`possibly_round` definitely fails), so we give up.
622     return None;
623 
624     // we've generated all requested digits of `v`, which should be also same to corresponding
625     // digits of `v - 1 ulp`. now we check if there is a unique representation shared by
626     // both `v - 1 ulp` and `v + 1 ulp`; this can be either same to generated digits, or
627     // to the rounded-up version of those digits. if the range contains multiple representations
628     // of the same length, we cannot be sure and should return `None` instead.
629     //
630     // all arguments here are scaled by the common (but implicit) value `k`, so that:
631     // - `remainder = (v % 10^kappa) * k`
632     // - `ten_kappa = 10^kappa * k`
633     // - `ulp = 2^-e * k`
634     //
635     // SAFETY: the first `len` bytes of `buf` must be initialized.
636     unsafe fn possibly_round(
637         buf: &mut [MaybeUninit<u8>],
638         mut len: usize,
639         mut exp: i16,
640         limit: i16,
641         remainder: u64,
642         ten_kappa: u64,
643         ulp: u64,
644     ) -> Option<(&[u8], i16)> {
645         debug_assert!(remainder < ten_kappa);
646 
647         //           10^kappa
648         //    :   :   :<->:   :
649         //    :   :   :   :   :
650         //    :|1 ulp|1 ulp|  :
651         //    :|<--->|<--->|  :
652         // ----|-----|-----|----
653         //     |     v     |
654         // v - 1 ulp   v + 1 ulp
655         //
656         // (for the reference, the dotted line indicates the exact value for
657         // possible representations in given number of digits.)
658         //
659         // error is too large that there are at least three possible representations
660         // between `v - 1 ulp` and `v + 1 ulp`. we cannot determine which one is correct.
661         if ulp >= ten_kappa {
662             return None;
663         }
664 
665         //    10^kappa
666         //   :<------->:
667         //   :         :
668         //   : |1 ulp|1 ulp|
669         //   : |<--->|<--->|
670         // ----|-----|-----|----
671         //     |     v     |
672         // v - 1 ulp   v + 1 ulp
673         //
674         // in fact, 1/2 ulp is enough to introduce two possible representations.
675         // (remember that we need a unique representation for both `v - 1 ulp` and `v + 1 ulp`.)
676         // this won't overflow, as `ulp < ten_kappa` from the first check.
677         if ten_kappa - ulp <= ulp {
678             return None;
679         }
680 
681         //     remainder
682         //       :<->|                           :
683         //       :   |                           :
684         //       :<--------- 10^kappa ---------->:
685         //     | :   |                           :
686         //     |1 ulp|1 ulp|                     :
687         //     |<--->|<--->|                     :
688         // ----|-----|-----|------------------------
689         //     |     v     |
690         // v - 1 ulp   v + 1 ulp
691         //
692         // if `v + 1 ulp` is closer to the rounded-down representation (which is already in `buf`),
693         // then we can safely return. note that `v - 1 ulp` *can* be less than the current
694         // representation, but as `1 ulp < 10^kappa / 2`, this condition is enough:
695         // the distance between `v - 1 ulp` and the current representation
696         // cannot exceed `10^kappa / 2`.
697         //
698         // the condition equals to `remainder + ulp < 10^kappa / 2`.
699         // since this can easily overflow, first check if `remainder < 10^kappa / 2`.
700         // we've already verified that `ulp < 10^kappa / 2`, so as long as
701         // `10^kappa` did not overflow after all, the second check is fine.
702         if ten_kappa - remainder > remainder && ten_kappa - 2 * remainder >= 2 * ulp {
703             // SAFETY: our caller initialized that memory.
704             return Some((unsafe { MaybeUninit::slice_assume_init_ref(&buf[..len]) }, exp));
705         }
706 
707         //   :<------- remainder ------>|   :
708         //   :                          |   :
709         //   :<--------- 10^kappa --------->:
710         //   :                    |     |   : |
711         //   :                    |1 ulp|1 ulp|
712         //   :                    |<--->|<--->|
713         // -----------------------|-----|-----|-----
714         //                        |     v     |
715         //                    v - 1 ulp   v + 1 ulp
716         //
717         // on the other hands, if `v - 1 ulp` is closer to the rounded-up representation,
718         // we should round up and return. for the same reason we don't need to check `v + 1 ulp`.
719         //
720         // the condition equals to `remainder - ulp >= 10^kappa / 2`.
721         // again we first check if `remainder > ulp` (note that this is not `remainder >= ulp`,
722         // as `10^kappa` is never zero). also note that `remainder - ulp <= 10^kappa`,
723         // so the second check does not overflow.
724         if remainder > ulp && ten_kappa - (remainder - ulp) <= remainder - ulp {
725             if let Some(c) =
726                 // SAFETY: our caller must have initialized that memory.
727                 round_up(unsafe { MaybeUninit::slice_assume_init_mut(&mut buf[..len]) })
728             {
729                 // only add an additional digit when we've been requested the fixed precision.
730                 // we also need to check that, if the original buffer was empty,
731                 // the additional digit can only be added when `exp == limit` (edge case).
732                 exp += 1;
733                 if exp > limit && len < buf.len() {
734                     buf[len] = MaybeUninit::new(c);
735                     len += 1;
736                 }
737             }
738             // SAFETY: we and our caller initialized that memory.
739             return Some((unsafe { MaybeUninit::slice_assume_init_ref(&buf[..len]) }, exp));
740         }
741 
742         // otherwise we are doomed (i.e., some values between `v - 1 ulp` and `v + 1 ulp` are
743         // rounding down and others are rounding up) and give up.
744         None
745     }
746 }
747 
748 /// The exact and fixed mode implementation for Grisu with Dragon fallback.
749 ///
750 /// This should be used for most cases.
format_exact<'a>( d: &Decoded, buf: &'a mut [MaybeUninit<u8>], limit: i16, ) -> ( &'a [u8], i16)751 pub fn format_exact<'a>(
752     d: &Decoded,
753     buf: &'a mut [MaybeUninit<u8>],
754     limit: i16,
755 ) -> (/*digits*/ &'a [u8], /*exp*/ i16) {
756     use crate::num::flt2dec::strategy::dragon::format_exact as fallback;
757     // SAFETY: The borrow checker is not smart enough to let us use `buf`
758     // in the second branch, so we launder the lifetime here. But we only re-use
759     // `buf` if `format_exact_opt` returned `None` so this is okay.
760     match format_exact_opt(d, unsafe { &mut *(buf as *mut _) }, limit) {
761         Some(ret) => ret,
762         None => fallback(d, buf, limit),
763     }
764 }
765