1 //! Internal implementation of the Grisu2 algorithm.
2 //!
3 //! The optimized routines are adapted from Andrea Samoljuk's `fpconv` library,
4 //! which is available [here](https://github.com/night-shift/fpconv).
5 //!
6 //! The following benchmarks were run on an "Intel(R) Core(TM) i7-6560U
7 //! CPU @ 2.20GHz" CPU, on Fedora 28, Linux kernel version 4.18.16-200
8 //! (x86-64), using the lexical formatter, `dtoa::write()` or `x.to_string()`,
9 //! avoiding any inefficiencies in Rust string parsing for `format!(...)`
10 //! or `write!()` macros. The code was compiled with LTO and at an optimization
11 //! level of 3.
12 //!
13 //! The benchmarks with `std` were compiled using "rustc 1.29.2 (17a9dc751
14 //! 2018-10-05", and the `no_std` benchmarks were compiled using "rustc
15 //! 1.31.0-nightly (46880f41b 2018-10-15)".
16 //!
17 //! The benchmark code may be found `benches/ftoa.rs`.
18 //!
19 //! # Benchmarks
20 //!
21 //! | Type  |  lexical (ns/iter) | to_string (ns/iter)   | Relative Increase |
22 //! |:-----:|:------------------:|:---------------------:|:-----------------:|
23 //! | f32   | 1,221,025          | 2,711,290             | 2.22x             |
24 //! | f64   | 1,248,397          | 3,558,305             | 2.85x             |
25 //!
26 //! # Raw Benchmarks
27 //!
28 //! ```text
29 //! test f32_dtoa      ... bench:   1,174,070 ns/iter (+/- 442,501)
30 //! test f32_lexical   ... bench:   1,433,234 ns/iter (+/- 633,261)
31 //! test f32_ryu       ... bench:     669,828 ns/iter (+/- 192,291)
32 //! test f32_to_string ... bench:   3,341,733 ns/iter (+/- 1,346,744)
33 //! test f64_dtoa      ... bench:   1,302,522 ns/iter (+/- 364,655)
34 //! test f64_lexical   ... bench:   1,375,384 ns/iter (+/- 596,860)
35 //! test f64_ryu       ... bench:   1,015,171 ns/iter (+/- 187,552)
36 //! test f64_to_string ... bench:   3,900,299 ns/iter (+/- 521,956)
37 //! ```
38 
39 // Code the generate the benchmark plot:
40 //  import numpy as np
41 //  import pandas as pd
42 //  import matplotlib.pyplot as plt
43 //  plt.style.use('ggplot')
44 //  lexical = np.array([1221025, 1375384]) / 1e6
45 //  dtoa = np.array([1174070, 1302522]) / 1e6
46 //  ryu = np.array([669828, 1015171]) / 1e6
47 //  rustcore = np.array([2711290, 3558305]) / 1e6
48 //  index = ["f32", "f64"]
49 //  df = pd.DataFrame({'lexical': lexical, 'lexical (dtoa)': dtoa, 'lexical (ryu)': ryu, 'rustcore': rustcore}, index = index, columns=['lexical', 'lexical (dtoa)', 'lexical (ryu)', 'rustcore'])
50 //  ax = df.plot.bar(rot=0, figsize=(16, 8), fontsize=14, color=['#E24A33', '#988ED5', '#8EBA42', '#348ABD'])
51 //  ax.set_ylabel("ms/iter")
52 //  ax.figure.tight_layout()
53 //  ax.legend(loc=2, prop={'size': 14})
54 //  plt.show()
55 
56 use crate::float::ExtendedFloat80;
57 use crate::util::*;
58 
59 // CACHED POWERS
60 
61 perftools_inline!{
62 /// Find cached power of 10 from the exponent.
63 fn cached_grisu_power(exp: i32, k: &mut i32)
64     -> &'static ExtendedFloat80
65 {
66     // FLOATING POINT CONSTANTS
67     const ONE_LOG_TEN: f64 = 0.30102999566398114;
68     const NPOWERS: i32 = 87;
69     const FIRSTPOWER: i32 = -348;       // 10 ^ -348
70     const STEPPOWERS: i32 = 8;
71     const EXPMAX: i32 = -32;
72     const EXPMIN: i32 = -60;
73 
74     let approx = -((exp + NPOWERS).as_f64()) * ONE_LOG_TEN;
75     let approx = approx.as_i32();
76     let mut idx = ((approx - FIRSTPOWER) / STEPPOWERS).as_usize();
77 
78     loop {
79         // Use `arr.get(idx)`, which explicitly provides a reference,
80         // instead of `arr[idx]`, which provides a value.
81         // We have a bug in versions <= 1.27.0 where it creates
82         // a local copy, which we then get the reference to and return.
83         // This allows use-after-free, without any warning, so we're
84         // using unidiomatic code to avoid any issue.
85         let power = GRISU_POWERS_OF_TEN.get(idx).unwrap();
86         let current = exp + power.exp + 64;
87         if current < EXPMIN {
88             idx += 1;
89             continue;
90         }
91 
92         if current > EXPMAX {
93             idx -= 1;
94             continue;
95         }
96 
97         *k = FIRSTPOWER + idx.as_i32() * STEPPOWERS;
98         return power;
99     }
100 }}
101 
102 /// Cached powers of ten as specified by the Grisu algorithm.
103 ///
104 /// Cached powers of 10^k, calculated as if by:
105 /// `ceil((alpha-e+63) * ONE_LOG_TEN);`
106 const GRISU_POWERS_OF_TEN: [ExtendedFloat80; 87] = [
107     ExtendedFloat80 { mant: 18054884314459144840, exp: -1220 },
108     ExtendedFloat80 { mant: 13451937075301367670, exp: -1193 },
109     ExtendedFloat80 { mant: 10022474136428063862, exp: -1166 },
110     ExtendedFloat80 { mant: 14934650266808366570, exp: -1140 },
111     ExtendedFloat80 { mant: 11127181549972568877, exp: -1113 },
112     ExtendedFloat80 { mant: 16580792590934885855, exp: -1087 },
113     ExtendedFloat80 { mant: 12353653155963782858, exp: -1060 },
114     ExtendedFloat80 { mant: 18408377700990114895, exp: -1034 },
115     ExtendedFloat80 { mant: 13715310171984221708, exp: -1007 },
116     ExtendedFloat80 { mant: 10218702384817765436, exp: -980 },
117     ExtendedFloat80 { mant: 15227053142812498563, exp: -954 },
118     ExtendedFloat80 { mant: 11345038669416679861, exp: -927 },
119     ExtendedFloat80 { mant: 16905424996341287883, exp: -901 },
120     ExtendedFloat80 { mant: 12595523146049147757, exp: -874 },
121     ExtendedFloat80 { mant: 9384396036005875287, exp: -847 },
122     ExtendedFloat80 { mant: 13983839803942852151, exp: -821 },
123     ExtendedFloat80 { mant: 10418772551374772303, exp: -794 },
124     ExtendedFloat80 { mant: 15525180923007089351, exp: -768 },
125     ExtendedFloat80 { mant: 11567161174868858868, exp: -741 },
126     ExtendedFloat80 { mant: 17236413322193710309, exp: -715 },
127     ExtendedFloat80 { mant: 12842128665889583758, exp: -688 },
128     ExtendedFloat80 { mant: 9568131466127621947, exp: -661 },
129     ExtendedFloat80 { mant: 14257626930069360058, exp: -635 },
130     ExtendedFloat80 { mant: 10622759856335341974, exp: -608 },
131     ExtendedFloat80 { mant: 15829145694278690180, exp: -582 },
132     ExtendedFloat80 { mant: 11793632577567316726, exp: -555 },
133     ExtendedFloat80 { mant: 17573882009934360870, exp: -529 },
134     ExtendedFloat80 { mant: 13093562431584567480, exp: -502 },
135     ExtendedFloat80 { mant: 9755464219737475723, exp: -475 },
136     ExtendedFloat80 { mant: 14536774485912137811, exp: -449 },
137     ExtendedFloat80 { mant: 10830740992659433045, exp: -422 },
138     ExtendedFloat80 { mant: 16139061738043178685, exp: -396 },
139     ExtendedFloat80 { mant: 12024538023802026127, exp: -369 },
140     ExtendedFloat80 { mant: 17917957937422433684, exp: -343 },
141     ExtendedFloat80 { mant: 13349918974505688015, exp: -316 },
142     ExtendedFloat80 { mant: 9946464728195732843, exp: -289 },
143     ExtendedFloat80 { mant: 14821387422376473014, exp: -263 },
144     ExtendedFloat80 { mant: 11042794154864902060, exp: -236 },
145     ExtendedFloat80 { mant: 16455045573212060422, exp: -210 },
146     ExtendedFloat80 { mant: 12259964326927110867, exp: -183 },
147     ExtendedFloat80 { mant: 18268770466636286478, exp: -157 },
148     ExtendedFloat80 { mant: 13611294676837538539, exp: -130 },
149     ExtendedFloat80 { mant: 10141204801825835212, exp: -103 },
150     ExtendedFloat80 { mant: 15111572745182864684, exp: -77 },
151     ExtendedFloat80 { mant: 11258999068426240000, exp: -50 },
152     ExtendedFloat80 { mant: 16777216000000000000, exp: -24 },
153     ExtendedFloat80 { mant: 12500000000000000000, exp:  3 },
154     ExtendedFloat80 { mant: 9313225746154785156, exp:  30 },
155     ExtendedFloat80 { mant: 13877787807814456755, exp: 56 },
156     ExtendedFloat80 { mant: 10339757656912845936, exp: 83 },
157     ExtendedFloat80 { mant: 15407439555097886824, exp: 109 },
158     ExtendedFloat80 { mant: 11479437019748901445, exp: 136 },
159     ExtendedFloat80 { mant: 17105694144590052135, exp: 162 },
160     ExtendedFloat80 { mant: 12744735289059618216, exp: 189 },
161     ExtendedFloat80 { mant: 9495567745759798747, exp: 216 },
162     ExtendedFloat80 { mant: 14149498560666738074, exp: 242 },
163     ExtendedFloat80 { mant: 10542197943230523224, exp: 269 },
164     ExtendedFloat80 { mant: 15709099088952724970, exp: 295 },
165     ExtendedFloat80 { mant: 11704190886730495818, exp: 322 },
166     ExtendedFloat80 { mant: 17440603504673385349, exp: 348 },
167     ExtendedFloat80 { mant: 12994262207056124023, exp: 375 },
168     ExtendedFloat80 { mant: 9681479787123295682, exp: 402 },
169     ExtendedFloat80 { mant: 14426529090290212157, exp: 428 },
170     ExtendedFloat80 { mant: 10748601772107342003, exp: 455 },
171     ExtendedFloat80 { mant: 16016664761464807395, exp: 481 },
172     ExtendedFloat80 { mant: 11933345169920330789, exp: 508 },
173     ExtendedFloat80 { mant: 17782069995880619868, exp: 534 },
174     ExtendedFloat80 { mant: 13248674568444952270, exp: 561 },
175     ExtendedFloat80 { mant: 9871031767461413346, exp: 588 },
176     ExtendedFloat80 { mant: 14708983551653345445, exp: 614 },
177     ExtendedFloat80 { mant: 10959046745042015199, exp: 641 },
178     ExtendedFloat80 { mant: 16330252207878254650, exp: 667 },
179     ExtendedFloat80 { mant: 12166986024289022870, exp: 694 },
180     ExtendedFloat80 { mant: 18130221999122236476, exp: 720 },
181     ExtendedFloat80 { mant: 13508068024458167312, exp: 747 },
182     ExtendedFloat80 { mant: 10064294952495520794, exp: 774 },
183     ExtendedFloat80 { mant: 14996968138956309548, exp: 800 },
184     ExtendedFloat80 { mant: 11173611982879273257, exp: 827 },
185     ExtendedFloat80 { mant: 16649979327439178909, exp: 853 },
186     ExtendedFloat80 { mant: 12405201291620119593, exp: 880 },
187     ExtendedFloat80 { mant: 9242595204427927429, exp: 907 },
188     ExtendedFloat80 { mant: 13772540099066387757, exp: 933 },
189     ExtendedFloat80 { mant: 10261342003245940623, exp: 960 },
190     ExtendedFloat80 { mant: 15290591125556738113, exp: 986 },
191     ExtendedFloat80 { mant: 11392378155556871081, exp: 1013 },
192     ExtendedFloat80 { mant: 16975966327722178521, exp: 1039 },
193     ExtendedFloat80 { mant: 12648080533535911531, exp: 1066 }
194 ];
195 
196 // FTOA DECIMAL
197 
198 // LOOKUPS
199 const TENS: [u64; 20] = [
200     10000000000000000000, 1000000000000000000, 100000000000000000,
201     10000000000000000, 1000000000000000, 100000000000000,
202     10000000000000, 1000000000000, 100000000000,
203     10000000000, 1000000000, 100000000,
204     10000000, 1000000, 100000,
205     10000, 1000, 100,
206     10, 1
207 ];
208 
209 // FPCONV GRISU
210 
211 perftools_inline!{
212 /// Round digit to sane approximation.
213 fn round_digit(digits: &mut [u8], ndigits: usize, delta: u64, mut rem: u64, kappa: u64, mant: u64)
214 {
215     while rem < mant && delta - rem >= kappa && (rem + kappa < mant || mant - rem > rem + kappa - mant)
216     {
217         digits[ndigits - 1] -= 1;
218         rem += kappa;
219     }
220 }}
221 
222 /// Generate digits from upper and lower range on rounding of number.
generate_digits(fp: &ExtendedFloat80, upper: &ExtendedFloat80, lower: &ExtendedFloat80, digits: &mut [u8], k: &mut i32) -> usize223 fn generate_digits(fp: &ExtendedFloat80, upper: &ExtendedFloat80, lower: &ExtendedFloat80, digits: &mut [u8], k: &mut i32)
224     -> usize
225 {
226     let wmant = upper.mant - fp.mant;
227     let mut delta = upper.mant - lower.mant;
228 
229     let one = ExtendedFloat80 {
230         mant: 1 << -upper.exp,
231         exp: upper.exp,
232     };
233 
234     let mut part1 = upper.mant >> -one.exp;
235     let mut part2 = upper.mant & (one.mant - 1);
236 
237     let mut idx: usize = 0;
238     let mut kappa: i32 = 10;
239     // 1000000000
240     // Guaranteed to be safe, TENS has 20 elements.
241     let mut divp = index!(TENS[10..]).iter();
242     while kappa > 0 {
243         // Remember not to continue! This loop has an increment condition.
244         let div = divp.next().unwrap();
245         let digit = part1 / div;
246         if digit != 0 || idx != 0 {
247             digits[idx] = digit.as_u8() + b'0';
248             idx += 1;
249         }
250 
251         part1 -= digit.as_u64() * div;
252         kappa -= 1;
253 
254         let tmp = (part1 <<-one.exp) + part2;
255         if tmp <= delta {
256             *k += kappa;
257             round_digit(digits, idx, delta, tmp, div << -one.exp, wmant);
258             return idx;
259         }
260     }
261 
262     /* 10 */
263     // Guaranteed to be safe, TENS has 20 elements.
264     let mut unit = index!(TENS[..=18]).iter().rev();
265     loop {
266         part2 *= 10;
267         delta *= 10;
268         kappa -= 1;
269 
270         let digit = part2 >> -one.exp;
271         if digit != 0 || idx != 0 {
272             digits[idx] = digit.as_u8() + b'0';
273             idx += 1;
274         }
275 
276         part2 &= one.mant - 1;
277         let ten = unit.next().unwrap();
278         if part2 < delta {
279             *k += kappa;
280             round_digit(digits, idx, delta, part2, one.mant, wmant * ten);
281             return idx;
282         }
283     }
284 }
285 
286 /// Core Grisu2 algorithm for the float formatter.
grisu2(d: f64, digits: &mut [u8], k: &mut i32) -> usize287 fn grisu2(d: f64, digits: &mut [u8], k: &mut i32)
288     -> usize
289 {
290     let mut w = ExtendedFloat80::from_f64(d);
291 
292     let (mut lower, mut upper) = w.normalized_boundaries();
293     w.normalize();
294 
295     let mut ki: i32 =  0;
296     let cp = cached_grisu_power(upper.exp, &mut ki);
297 
298     w     = w.mul(cp);
299     upper = upper.mul(cp);
300     lower = lower.mul(cp);
301 
302     lower.mant += 1;
303     upper.mant -= 1;
304 
305     *k = -ki;
306 
307     generate_digits(&w, &upper, &lower, digits, k)
308 }
309 
310 /// Write the produced digits to string.
311 ///
312 /// Adds formatting for exponents, and other types of information.
emit_digits(digits: &mut [u8], mut ndigits: usize, dest: &mut [u8], k: i32) -> usize313 fn emit_digits(digits: &mut [u8], mut ndigits: usize, dest: &mut [u8], k: i32)
314     -> usize
315 {
316     let exp = k + ndigits.as_i32() - 1;
317     let mut exp = exp.abs().as_usize();
318 
319     // write plain integer (with ".0" suffix).
320     if k >= 0 && exp < (ndigits + 7) {
321         let idx = ndigits;
322         let count = k.as_usize();
323         // These are all safe, since digits.len() >= idx, and
324         // dest.len() >= idx+count+2, so the range must be valid.
325         copy_to_dst(dest, &index!(digits[..idx]));
326         write_bytes(&mut index_mut!(dest[idx..idx+count]), b'0');
327         copy_to_dst(&mut index_mut!(dest[idx+count..]), b".0");
328 
329         return ndigits + k.as_usize() + 2;
330     }
331 
332     // write decimal w/o scientific notation
333     if k < 0 && (k > -7 || exp < 4) {
334         let offset = ndigits.as_isize() - k.abs().as_isize();
335         // fp < 1.0 -> write leading zero
336         if offset <= 0 {
337             let offset = (-offset).as_usize();
338             // These are all safe, since digits.len() >= ndigits, and
339             // dest.len() >= ndigits+offset+2, so the range must be valid.
340             index_mut!(dest[0] = b'0');
341             index_mut!(dest[1] = b'.');
342             write_bytes(&mut index_mut!(dest[2..offset+2]), b'0');
343             copy_to_dst(&mut index_mut!(dest[offset+2..]), &index!(digits[..ndigits]));
344 
345             return ndigits + 2 + offset;
346 
347         } else {
348             // fp > 1.0
349             let offset = offset.as_usize();
350             // These are all safe, since digits.len() >= ndigits, and
351             // dest.len() >= ndigits+1, so the range must be valid.
352             copy_to_dst(dest, &index!(digits[..offset]));
353             index_mut!(dest[offset] = b'.');
354             copy_to_dst(&mut index_mut!(dest[offset+1..]), &index!(digits[offset..ndigits]));
355 
356             return ndigits + 1;
357         }
358     }
359 
360     // write decimal w/ scientific notation
361     ndigits = ndigits.min(18);
362 
363     let dst_len = dest.len();
364     let mut dst_iter = dest.iter_mut();
365     let mut src_iter = digits.iter().take(ndigits);
366     *dst_iter.next().unwrap() = *src_iter.next().unwrap();
367 
368     if ndigits > 1 {
369         *dst_iter.next().unwrap() = b'.';
370         for &src in src_iter {
371             *dst_iter.next().unwrap() = src;
372         }
373     }
374 
375     *dst_iter.next().unwrap() = exponent_notation_char(10);
376 
377     *dst_iter.next().unwrap() = match k + ndigits.as_i32() - 1 < 0 {
378         true    => b'-',
379         false   => b'+',
380     };
381 
382     let mut cent: usize = 0;
383     if exp > 99 {
384         cent = exp / 100;
385         *dst_iter.next().unwrap() = cent.as_u8() + b'0';
386         exp -= cent * 100;
387     }
388     if exp > 9 {
389         let dec = exp / 10;
390         *dst_iter.next().unwrap() = dec.as_u8() + b'0';
391         exp -= dec * 10;
392     } else if cent != 0 {
393         *dst_iter.next().unwrap() = b'0';
394     }
395 
396     let shift = (exp % 10).as_u8();
397     *dst_iter.next().unwrap() = shift + b'0';
398 
399     dst_len - dst_iter.count()
400 }
401 
402 perftools_inline!{
403 fn fpconv_dtoa(d: f64, dest: &mut [u8]) -> usize
404 {
405     let mut digits: [u8; 18] = [0; 18];
406     let mut k: i32 = 0;
407     let ndigits = grisu2(d, &mut digits, &mut k);
408     emit_digits(&mut digits, ndigits, dest, k)
409 }}
410 
411 // DECIMAL
412 
413 perftools_inline!{
414 /// Forward to double_decimal.
415 ///
416 /// `f` must be non-special (NaN or infinite), non-negative,
417 /// and non-zero.
418 pub(crate) fn float_decimal<'a>(f: f32, bytes: &'a mut [u8])
419     -> usize
420 {
421     double_decimal(f.as_f64(), bytes)
422 }}
423 
424 // F64
425 
426 perftools_inline!{
427 /// Optimized algorithm for decimal numbers.
428 ///
429 /// `d` must be non-special (NaN or infinite), non-negative,
430 /// and non-zero.
431 pub(crate) fn double_decimal<'a>(d: f64, bytes: &'a mut [u8])
432     -> usize
433 {
434     fpconv_dtoa(d, bytes)
435 }}
436