1 //! Internal implementation of the Grisu2 algorithm.
2 //!
3 //! The optimized routines are adapted from Andrea Samoljuk's `fpconv` library,
4 //! which is available [here](https://github.com/night-shift/fpconv).
5 //!
6 //! The following benchmarks were run on an "Intel(R) Core(TM) i7-6560U
7 //! CPU @ 2.20GHz" CPU, on Fedora 28, Linux kernel version 4.18.16-200
8 //! (x86-64), using the lexical formatter, `dtoa::write()` or `x.to_string()`,
9 //! avoiding any inefficiencies in Rust string parsing for `format!(...)`
10 //! or `write!()` macros. The code was compiled with LTO and at an optimization
11 //! level of 3.
12 //!
13 //! The benchmarks with `std` were compiled using "rustc 1.29.2 (17a9dc751
14 //! 2018-10-05", and the `no_std` benchmarks were compiled using "rustc
15 //! 1.31.0-nightly (46880f41b 2018-10-15)".
16 //!
17 //! The benchmark code may be found `benches/ftoa.rs`.
18 //!
19 //! # Benchmarks
20 //!
21 //! | Type | lexical (ns/iter) | to_string (ns/iter) | Relative Increase |
22 //! |:-----:|:------------------:|:---------------------:|:-----------------:|
23 //! | f32 | 1,221,025 | 2,711,290 | 2.22x |
24 //! | f64 | 1,248,397 | 3,558,305 | 2.85x |
25 //!
26 //! # Raw Benchmarks
27 //!
28 //! ```text
29 //! test f32_dtoa ... bench: 1,174,070 ns/iter (+/- 442,501)
30 //! test f32_lexical ... bench: 1,433,234 ns/iter (+/- 633,261)
31 //! test f32_ryu ... bench: 669,828 ns/iter (+/- 192,291)
32 //! test f32_to_string ... bench: 3,341,733 ns/iter (+/- 1,346,744)
33 //! test f64_dtoa ... bench: 1,302,522 ns/iter (+/- 364,655)
34 //! test f64_lexical ... bench: 1,375,384 ns/iter (+/- 596,860)
35 //! test f64_ryu ... bench: 1,015,171 ns/iter (+/- 187,552)
36 //! test f64_to_string ... bench: 3,900,299 ns/iter (+/- 521,956)
37 //! ```
38
39 // Code the generate the benchmark plot:
40 // import numpy as np
41 // import pandas as pd
42 // import matplotlib.pyplot as plt
43 // plt.style.use('ggplot')
44 // lexical = np.array([1221025, 1375384]) / 1e6
45 // dtoa = np.array([1174070, 1302522]) / 1e6
46 // ryu = np.array([669828, 1015171]) / 1e6
47 // rustcore = np.array([2711290, 3558305]) / 1e6
48 // index = ["f32", "f64"]
49 // df = pd.DataFrame({'lexical': lexical, 'lexical (dtoa)': dtoa, 'lexical (ryu)': ryu, 'rustcore': rustcore}, index = index, columns=['lexical', 'lexical (dtoa)', 'lexical (ryu)', 'rustcore'])
50 // ax = df.plot.bar(rot=0, figsize=(16, 8), fontsize=14, color=['#E24A33', '#988ED5', '#8EBA42', '#348ABD'])
51 // ax.set_ylabel("ms/iter")
52 // ax.figure.tight_layout()
53 // ax.legend(loc=2, prop={'size': 14})
54 // plt.show()
55
56 use crate::float::ExtendedFloat80;
57 use crate::util::*;
58
59 // CACHED POWERS
60
61 perftools_inline!{
62 /// Find cached power of 10 from the exponent.
63 fn cached_grisu_power(exp: i32, k: &mut i32)
64 -> &'static ExtendedFloat80
65 {
66 // FLOATING POINT CONSTANTS
67 const ONE_LOG_TEN: f64 = 0.30102999566398114;
68 const NPOWERS: i32 = 87;
69 const FIRSTPOWER: i32 = -348; // 10 ^ -348
70 const STEPPOWERS: i32 = 8;
71 const EXPMAX: i32 = -32;
72 const EXPMIN: i32 = -60;
73
74 let approx = -((exp + NPOWERS).as_f64()) * ONE_LOG_TEN;
75 let approx = approx.as_i32();
76 let mut idx = ((approx - FIRSTPOWER) / STEPPOWERS).as_usize();
77
78 loop {
79 // Use `arr.get(idx)`, which explicitly provides a reference,
80 // instead of `arr[idx]`, which provides a value.
81 // We have a bug in versions <= 1.27.0 where it creates
82 // a local copy, which we then get the reference to and return.
83 // This allows use-after-free, without any warning, so we're
84 // using unidiomatic code to avoid any issue.
85 let power = GRISU_POWERS_OF_TEN.get(idx).unwrap();
86 let current = exp + power.exp + 64;
87 if current < EXPMIN {
88 idx += 1;
89 continue;
90 }
91
92 if current > EXPMAX {
93 idx -= 1;
94 continue;
95 }
96
97 *k = FIRSTPOWER + idx.as_i32() * STEPPOWERS;
98 return power;
99 }
100 }}
101
102 /// Cached powers of ten as specified by the Grisu algorithm.
103 ///
104 /// Cached powers of 10^k, calculated as if by:
105 /// `ceil((alpha-e+63) * ONE_LOG_TEN);`
106 const GRISU_POWERS_OF_TEN: [ExtendedFloat80; 87] = [
107 ExtendedFloat80 { mant: 18054884314459144840, exp: -1220 },
108 ExtendedFloat80 { mant: 13451937075301367670, exp: -1193 },
109 ExtendedFloat80 { mant: 10022474136428063862, exp: -1166 },
110 ExtendedFloat80 { mant: 14934650266808366570, exp: -1140 },
111 ExtendedFloat80 { mant: 11127181549972568877, exp: -1113 },
112 ExtendedFloat80 { mant: 16580792590934885855, exp: -1087 },
113 ExtendedFloat80 { mant: 12353653155963782858, exp: -1060 },
114 ExtendedFloat80 { mant: 18408377700990114895, exp: -1034 },
115 ExtendedFloat80 { mant: 13715310171984221708, exp: -1007 },
116 ExtendedFloat80 { mant: 10218702384817765436, exp: -980 },
117 ExtendedFloat80 { mant: 15227053142812498563, exp: -954 },
118 ExtendedFloat80 { mant: 11345038669416679861, exp: -927 },
119 ExtendedFloat80 { mant: 16905424996341287883, exp: -901 },
120 ExtendedFloat80 { mant: 12595523146049147757, exp: -874 },
121 ExtendedFloat80 { mant: 9384396036005875287, exp: -847 },
122 ExtendedFloat80 { mant: 13983839803942852151, exp: -821 },
123 ExtendedFloat80 { mant: 10418772551374772303, exp: -794 },
124 ExtendedFloat80 { mant: 15525180923007089351, exp: -768 },
125 ExtendedFloat80 { mant: 11567161174868858868, exp: -741 },
126 ExtendedFloat80 { mant: 17236413322193710309, exp: -715 },
127 ExtendedFloat80 { mant: 12842128665889583758, exp: -688 },
128 ExtendedFloat80 { mant: 9568131466127621947, exp: -661 },
129 ExtendedFloat80 { mant: 14257626930069360058, exp: -635 },
130 ExtendedFloat80 { mant: 10622759856335341974, exp: -608 },
131 ExtendedFloat80 { mant: 15829145694278690180, exp: -582 },
132 ExtendedFloat80 { mant: 11793632577567316726, exp: -555 },
133 ExtendedFloat80 { mant: 17573882009934360870, exp: -529 },
134 ExtendedFloat80 { mant: 13093562431584567480, exp: -502 },
135 ExtendedFloat80 { mant: 9755464219737475723, exp: -475 },
136 ExtendedFloat80 { mant: 14536774485912137811, exp: -449 },
137 ExtendedFloat80 { mant: 10830740992659433045, exp: -422 },
138 ExtendedFloat80 { mant: 16139061738043178685, exp: -396 },
139 ExtendedFloat80 { mant: 12024538023802026127, exp: -369 },
140 ExtendedFloat80 { mant: 17917957937422433684, exp: -343 },
141 ExtendedFloat80 { mant: 13349918974505688015, exp: -316 },
142 ExtendedFloat80 { mant: 9946464728195732843, exp: -289 },
143 ExtendedFloat80 { mant: 14821387422376473014, exp: -263 },
144 ExtendedFloat80 { mant: 11042794154864902060, exp: -236 },
145 ExtendedFloat80 { mant: 16455045573212060422, exp: -210 },
146 ExtendedFloat80 { mant: 12259964326927110867, exp: -183 },
147 ExtendedFloat80 { mant: 18268770466636286478, exp: -157 },
148 ExtendedFloat80 { mant: 13611294676837538539, exp: -130 },
149 ExtendedFloat80 { mant: 10141204801825835212, exp: -103 },
150 ExtendedFloat80 { mant: 15111572745182864684, exp: -77 },
151 ExtendedFloat80 { mant: 11258999068426240000, exp: -50 },
152 ExtendedFloat80 { mant: 16777216000000000000, exp: -24 },
153 ExtendedFloat80 { mant: 12500000000000000000, exp: 3 },
154 ExtendedFloat80 { mant: 9313225746154785156, exp: 30 },
155 ExtendedFloat80 { mant: 13877787807814456755, exp: 56 },
156 ExtendedFloat80 { mant: 10339757656912845936, exp: 83 },
157 ExtendedFloat80 { mant: 15407439555097886824, exp: 109 },
158 ExtendedFloat80 { mant: 11479437019748901445, exp: 136 },
159 ExtendedFloat80 { mant: 17105694144590052135, exp: 162 },
160 ExtendedFloat80 { mant: 12744735289059618216, exp: 189 },
161 ExtendedFloat80 { mant: 9495567745759798747, exp: 216 },
162 ExtendedFloat80 { mant: 14149498560666738074, exp: 242 },
163 ExtendedFloat80 { mant: 10542197943230523224, exp: 269 },
164 ExtendedFloat80 { mant: 15709099088952724970, exp: 295 },
165 ExtendedFloat80 { mant: 11704190886730495818, exp: 322 },
166 ExtendedFloat80 { mant: 17440603504673385349, exp: 348 },
167 ExtendedFloat80 { mant: 12994262207056124023, exp: 375 },
168 ExtendedFloat80 { mant: 9681479787123295682, exp: 402 },
169 ExtendedFloat80 { mant: 14426529090290212157, exp: 428 },
170 ExtendedFloat80 { mant: 10748601772107342003, exp: 455 },
171 ExtendedFloat80 { mant: 16016664761464807395, exp: 481 },
172 ExtendedFloat80 { mant: 11933345169920330789, exp: 508 },
173 ExtendedFloat80 { mant: 17782069995880619868, exp: 534 },
174 ExtendedFloat80 { mant: 13248674568444952270, exp: 561 },
175 ExtendedFloat80 { mant: 9871031767461413346, exp: 588 },
176 ExtendedFloat80 { mant: 14708983551653345445, exp: 614 },
177 ExtendedFloat80 { mant: 10959046745042015199, exp: 641 },
178 ExtendedFloat80 { mant: 16330252207878254650, exp: 667 },
179 ExtendedFloat80 { mant: 12166986024289022870, exp: 694 },
180 ExtendedFloat80 { mant: 18130221999122236476, exp: 720 },
181 ExtendedFloat80 { mant: 13508068024458167312, exp: 747 },
182 ExtendedFloat80 { mant: 10064294952495520794, exp: 774 },
183 ExtendedFloat80 { mant: 14996968138956309548, exp: 800 },
184 ExtendedFloat80 { mant: 11173611982879273257, exp: 827 },
185 ExtendedFloat80 { mant: 16649979327439178909, exp: 853 },
186 ExtendedFloat80 { mant: 12405201291620119593, exp: 880 },
187 ExtendedFloat80 { mant: 9242595204427927429, exp: 907 },
188 ExtendedFloat80 { mant: 13772540099066387757, exp: 933 },
189 ExtendedFloat80 { mant: 10261342003245940623, exp: 960 },
190 ExtendedFloat80 { mant: 15290591125556738113, exp: 986 },
191 ExtendedFloat80 { mant: 11392378155556871081, exp: 1013 },
192 ExtendedFloat80 { mant: 16975966327722178521, exp: 1039 },
193 ExtendedFloat80 { mant: 12648080533535911531, exp: 1066 }
194 ];
195
196 // FTOA DECIMAL
197
198 // LOOKUPS
199 const TENS: [u64; 20] = [
200 10000000000000000000, 1000000000000000000, 100000000000000000,
201 10000000000000000, 1000000000000000, 100000000000000,
202 10000000000000, 1000000000000, 100000000000,
203 10000000000, 1000000000, 100000000,
204 10000000, 1000000, 100000,
205 10000, 1000, 100,
206 10, 1
207 ];
208
209 // FPCONV GRISU
210
211 perftools_inline!{
212 /// Round digit to sane approximation.
213 fn round_digit(digits: &mut [u8], ndigits: usize, delta: u64, mut rem: u64, kappa: u64, mant: u64)
214 {
215 while rem < mant && delta - rem >= kappa && (rem + kappa < mant || mant - rem > rem + kappa - mant)
216 {
217 digits[ndigits - 1] -= 1;
218 rem += kappa;
219 }
220 }}
221
222 /// Generate digits from upper and lower range on rounding of number.
generate_digits(fp: &ExtendedFloat80, upper: &ExtendedFloat80, lower: &ExtendedFloat80, digits: &mut [u8], k: &mut i32) -> usize223 fn generate_digits(fp: &ExtendedFloat80, upper: &ExtendedFloat80, lower: &ExtendedFloat80, digits: &mut [u8], k: &mut i32)
224 -> usize
225 {
226 let wmant = upper.mant - fp.mant;
227 let mut delta = upper.mant - lower.mant;
228
229 let one = ExtendedFloat80 {
230 mant: 1 << -upper.exp,
231 exp: upper.exp,
232 };
233
234 let mut part1 = upper.mant >> -one.exp;
235 let mut part2 = upper.mant & (one.mant - 1);
236
237 let mut idx: usize = 0;
238 let mut kappa: i32 = 10;
239 // 1000000000
240 // Guaranteed to be safe, TENS has 20 elements.
241 let mut divp = index!(TENS[10..]).iter();
242 while kappa > 0 {
243 // Remember not to continue! This loop has an increment condition.
244 let div = divp.next().unwrap();
245 let digit = part1 / div;
246 if digit != 0 || idx != 0 {
247 digits[idx] = digit.as_u8() + b'0';
248 idx += 1;
249 }
250
251 part1 -= digit.as_u64() * div;
252 kappa -= 1;
253
254 let tmp = (part1 <<-one.exp) + part2;
255 if tmp <= delta {
256 *k += kappa;
257 round_digit(digits, idx, delta, tmp, div << -one.exp, wmant);
258 return idx;
259 }
260 }
261
262 /* 10 */
263 // Guaranteed to be safe, TENS has 20 elements.
264 let mut unit = index!(TENS[..=18]).iter().rev();
265 loop {
266 part2 *= 10;
267 delta *= 10;
268 kappa -= 1;
269
270 let digit = part2 >> -one.exp;
271 if digit != 0 || idx != 0 {
272 digits[idx] = digit.as_u8() + b'0';
273 idx += 1;
274 }
275
276 part2 &= one.mant - 1;
277 let ten = unit.next().unwrap();
278 if part2 < delta {
279 *k += kappa;
280 round_digit(digits, idx, delta, part2, one.mant, wmant * ten);
281 return idx;
282 }
283 }
284 }
285
286 /// Core Grisu2 algorithm for the float formatter.
grisu2(d: f64, digits: &mut [u8], k: &mut i32) -> usize287 fn grisu2(d: f64, digits: &mut [u8], k: &mut i32)
288 -> usize
289 {
290 let mut w = ExtendedFloat80::from_f64(d);
291
292 let (mut lower, mut upper) = w.normalized_boundaries();
293 w.normalize();
294
295 let mut ki: i32 = 0;
296 let cp = cached_grisu_power(upper.exp, &mut ki);
297
298 w = w.mul(cp);
299 upper = upper.mul(cp);
300 lower = lower.mul(cp);
301
302 lower.mant += 1;
303 upper.mant -= 1;
304
305 *k = -ki;
306
307 generate_digits(&w, &upper, &lower, digits, k)
308 }
309
310 /// Write the produced digits to string.
311 ///
312 /// Adds formatting for exponents, and other types of information.
emit_digits(digits: &mut [u8], mut ndigits: usize, dest: &mut [u8], k: i32) -> usize313 fn emit_digits(digits: &mut [u8], mut ndigits: usize, dest: &mut [u8], k: i32)
314 -> usize
315 {
316 let exp = k + ndigits.as_i32() - 1;
317 let mut exp = exp.abs().as_usize();
318
319 // write plain integer (with ".0" suffix).
320 if k >= 0 && exp < (ndigits + 7) {
321 let idx = ndigits;
322 let count = k.as_usize();
323 // These are all safe, since digits.len() >= idx, and
324 // dest.len() >= idx+count+2, so the range must be valid.
325 copy_to_dst(dest, &index!(digits[..idx]));
326 write_bytes(&mut index_mut!(dest[idx..idx+count]), b'0');
327 copy_to_dst(&mut index_mut!(dest[idx+count..]), b".0");
328
329 return ndigits + k.as_usize() + 2;
330 }
331
332 // write decimal w/o scientific notation
333 if k < 0 && (k > -7 || exp < 4) {
334 let offset = ndigits.as_isize() - k.abs().as_isize();
335 // fp < 1.0 -> write leading zero
336 if offset <= 0 {
337 let offset = (-offset).as_usize();
338 // These are all safe, since digits.len() >= ndigits, and
339 // dest.len() >= ndigits+offset+2, so the range must be valid.
340 index_mut!(dest[0] = b'0');
341 index_mut!(dest[1] = b'.');
342 write_bytes(&mut index_mut!(dest[2..offset+2]), b'0');
343 copy_to_dst(&mut index_mut!(dest[offset+2..]), &index!(digits[..ndigits]));
344
345 return ndigits + 2 + offset;
346
347 } else {
348 // fp > 1.0
349 let offset = offset.as_usize();
350 // These are all safe, since digits.len() >= ndigits, and
351 // dest.len() >= ndigits+1, so the range must be valid.
352 copy_to_dst(dest, &index!(digits[..offset]));
353 index_mut!(dest[offset] = b'.');
354 copy_to_dst(&mut index_mut!(dest[offset+1..]), &index!(digits[offset..ndigits]));
355
356 return ndigits + 1;
357 }
358 }
359
360 // write decimal w/ scientific notation
361 ndigits = ndigits.min(18);
362
363 let dst_len = dest.len();
364 let mut dst_iter = dest.iter_mut();
365 let mut src_iter = digits.iter().take(ndigits);
366 *dst_iter.next().unwrap() = *src_iter.next().unwrap();
367
368 if ndigits > 1 {
369 *dst_iter.next().unwrap() = b'.';
370 for &src in src_iter {
371 *dst_iter.next().unwrap() = src;
372 }
373 }
374
375 *dst_iter.next().unwrap() = exponent_notation_char(10);
376
377 *dst_iter.next().unwrap() = match k + ndigits.as_i32() - 1 < 0 {
378 true => b'-',
379 false => b'+',
380 };
381
382 let mut cent: usize = 0;
383 if exp > 99 {
384 cent = exp / 100;
385 *dst_iter.next().unwrap() = cent.as_u8() + b'0';
386 exp -= cent * 100;
387 }
388 if exp > 9 {
389 let dec = exp / 10;
390 *dst_iter.next().unwrap() = dec.as_u8() + b'0';
391 exp -= dec * 10;
392 } else if cent != 0 {
393 *dst_iter.next().unwrap() = b'0';
394 }
395
396 let shift = (exp % 10).as_u8();
397 *dst_iter.next().unwrap() = shift + b'0';
398
399 dst_len - dst_iter.count()
400 }
401
402 perftools_inline!{
403 fn fpconv_dtoa(d: f64, dest: &mut [u8]) -> usize
404 {
405 let mut digits: [u8; 18] = [0; 18];
406 let mut k: i32 = 0;
407 let ndigits = grisu2(d, &mut digits, &mut k);
408 emit_digits(&mut digits, ndigits, dest, k)
409 }}
410
411 // DECIMAL
412
413 perftools_inline!{
414 /// Forward to double_decimal.
415 ///
416 /// `f` must be non-special (NaN or infinite), non-negative,
417 /// and non-zero.
418 pub(crate) fn float_decimal<'a>(f: f32, bytes: &'a mut [u8])
419 -> usize
420 {
421 double_decimal(f.as_f64(), bytes)
422 }}
423
424 // F64
425
426 perftools_inline!{
427 /// Optimized algorithm for decimal numbers.
428 ///
429 /// `d` must be non-special (NaN or infinite), non-negative,
430 /// and non-zero.
431 pub(crate) fn double_decimal<'a>(d: f64, bytes: &'a mut [u8])
432 -> usize
433 {
434 fpconv_dtoa(d, bytes)
435 }}
436