1 //! Fast estimation of the accurate representation of a float.
2 //!
3 //! Based off the Golang implementation of the Eisel-Lemire algorithm,
4 //! found here:
5 //!     https://github.com/golang/go/blob/2ebe77a2fda1ee9ff6fd9a3e08933ad1ebaea039/src/strconv/eisel_lemire.go
6 //!
7 //! Which, itself was based off of the Wuff's implementation:
8 //!     https://github.com/google/wuffs/blob/ba3818cb6b473a2ed0b38ecfc07dbbd3a97e8ae7/internal/cgen/base/floatconv-submodule-code.c
9 //!
10 //! The original algorithm may be found here:
11 //!     https://github.com/lemire/fast_double_parser
12 //!
13 //! And an in-depth blogpost describing the algorithms may be found here:
14 //!     https://nigeltao.github.io/blog/2020/eisel-lemire.html
15 //!
16 //! # Magic Number Generation
17 //!
18 //! ```python
19 //! import math
20 //!
21 //! def get_range(max_exp, bitshift):
22 //!     den = 1 << bitshift
23 //!     num = int(math.ceil(math.log2(10) * den))
24 //!     for exp10 in range(0, max_exp):
25 //!         exp2_exact = int(math.log2(10**exp10))
26 //!         exp2_guess = num * exp10 // den
27 //!         if exp2_exact != exp2_guess:
28 //!             raise ValueError(f'{exp10}')
29 //!     return num, den
30 //! ```
31 //!
32 //! For 64-bit and smaller floats, we therefore need a bitshift of 16,
33 //! so our magic number is `217706`. For 128-bit floats, we need a bitshift
34 //! of >= 25, so we round up to 32, and therefore need a magic number
35 //! of `14267572528`. Note that due to the storage requirements,
36 //! 128-bit floats do not currently use this algorithm.
37 
38 use super::extended_float;
39 use super::num::*;
40 use super::powers::*;
41 
42 // MUL
43 // ---
44 
45 /// Multiply two unsigned, integral values, and return the hi and lo product.
46 #[inline(always)]
mul(x: u64, y: u64) -> (u64, u64)47 pub(crate) fn mul(x: u64, y: u64) -> (u64, u64) {
48     // Extract high-and-low masks.
49     let x1 = x >> u64::HALF;
50     let x0 = x & u64::LOMASK;
51     let y1 = y >> u64::HALF;
52     let y0 = y & u64::LOMASK;
53 
54     // Get our products
55     let w0 = x0 * y0;
56     let tmp = (x1 * y0) + (w0 >> u64::HALF);
57     let w1 = tmp & u64::LOMASK;
58     let w2 = tmp >> u64::HALF;
59     let w1 = w1 + x0 * y1;
60     let hi = (x1 * y1) + w2 + (w1 >> u64::HALF);
61     let lo = x.wrapping_mul(y);
62 
63     (hi, lo)
64 }
65 
66 // SHIFT
67 // -----
68 
69 /// Shift significant digits to at most the carry bit
70 /// The carry bit is 1 above the hidden bit, in the exponent,
71 /// or mantissa size + 2.
72 #[inline(always)]
shift_to_carry(x_hi: u64, exp2: i32, carry_shift: i32) -> (u64, i32)73 fn shift_to_carry(x_hi: u64, exp2: i32, carry_shift: i32) -> (u64, i32) {
74     // Carry out the shift
75     let msb_shift = u64::FULL - 1;
76     let msb = x_hi >> msb_shift;
77     let shift = msb.as_i32() + carry_shift;
78     let mantissa = x_hi >> shift;
79     let exp2 = exp2 - (1i32 ^ msb.as_i32());
80 
81     (mantissa, exp2)
82 }
83 
84 // TO FLOAT
85 // --------
86 
87 /// Convert mantissa and binary exponent to floating-point representation.
88 ///
89 /// This function expects the following things:
90 ///     1). The highest mantissa bit set is 1 above the carry bit.
91 ///     2). The lowest mantissa bit set is the carry bit.
92 ///         That is, 2 above the hidden bit, or 1 above the hidden bit.
93 ///     3). The binary exponent is adjusted for the exponent bias.
94 #[inline(always)]
to_float<F>(mantissa: u64, exp: i32) -> (F, bool) where F: Float,95 fn to_float<F>(mantissa: u64, exp: i32) -> (F, bool)
96 where
97     F: Float,
98 {
99     // Check denormal values for underflow.
100     if exp <= -(F::MANTISSA_SIZE + 2) {
101         // Have a literal zero. If we shift the bits, we'll get 0.
102         return (F::ZERO, true);
103     } else if exp <= 0 {
104         // We don't actually care about the accuracy here,
105         // since we're going straight to the extended-float algorithm.
106         return (F::ZERO, false);
107     }
108 
109     // Get our raw bits.
110     let mut exp = F::Unsigned::as_cast(exp);
111     let mut mantissa = F::Unsigned::as_cast(mantissa);
112 
113     // Round-nearest, tie-even.
114     let zero = F::Unsigned::ZERO;
115     let one = F::Unsigned::as_cast(1);
116     mantissa += mantissa & one;
117 
118     // Shift them into position.
119     mantissa >>= 1i32;
120     let precision = F::MANTISSA_SIZE + 1;
121     if mantissa >> precision > zero {
122         mantissa >>= 1i32;
123         exp += one;
124     }
125 
126     // Check our mantissa representation is valid, that is,
127     // we didn't have a bit mantissa or hidden bit set.
128     let mask = F::MANTISSA_MASK | F::HIDDEN_BIT_MASK;
129     debug_assert!(mantissa & mask == mantissa);
130 
131     // Check for overflow, if so, return a literal infinity.
132     let max_exp = F::MAX_EXPONENT + F::EXPONENT_BIAS;
133     if exp >= F::Unsigned::as_cast(max_exp) {
134         let float = F::from_bits(F::INFINITY_BITS);
135         return (float, true);
136     }
137 
138     // Should fail, we shouldn't have any exponent bits set.
139     mantissa &= F::MANTISSA_MASK;
140     exp <<= F::MANTISSA_SIZE;
141     let bits = exp | mantissa;
142 
143     (F::from_bits(bits), true)
144 }
145 
146 // EISEL-LEMIRE
147 // ------------
148 
149 /// Create a precise native float using the Eisel-Lemire algorithm.
150 ///
151 /// NOTE: If the Eisel-Lamire algorithm cannot differentiate a halfway
152 /// representation, it cannot determine whether to round up or down
153 /// to determine the correct `b` value for big-float determination.
154 ///
155 /// In that case, we fall back to extended-float to determine that
156 /// representation.
157 #[inline]
eisel_lemire<F>(mantissa: u64, exponent: i32) -> (F, bool) where F: Float,158 pub(super) fn eisel_lemire<F>(mantissa: u64, exponent: i32) -> (F, bool)
159 where
160     F: Float,
161 {
162     // Check if the value is outside of our max range:
163     //  If the value is above our max range, we have to have an infinity,
164     //  and we have an exact representation (1e348) is infinity, which
165     //  is the minimum possible value above this range.
166     //
167     // For negative values, we're guaranteed to have 0 as well:
168     //  with 2470328229206232720e-342, we round to 0, while with
169     //  2470328229206232721e-342, we round up to 5e-324. Both of these
170     //  contain the maximum number of mantissa digits (19), so our
171     //  base-10 exponent cannot get any lower.
172     //
173     // Note that this only checks beyond the limits of f64, we do
174     // checks for narrower types further in.
175     if exponent < MIN_DENORMAL_EXP10 {
176         return (F::ZERO, true);
177     } else if exponent > MAX_NORMAL_EXP10 {
178         let float = F::from_bits(F::INFINITY_BITS);
179         return (float, true);
180     }
181 
182     // Normalize the mantissa, and calculate the bias.
183     let ctlz = mantissa.leading_zeros() as i32;
184     let mantissa = mantissa << ctlz;
185     let bias = F::EXPONENT_BIAS - F::MANTISSA_SIZE;
186 
187     // Need to convert our base 10 exponent to base 2, as an estimate.
188     // See module documentation for how we generated these magic numbers.
189     let unbiased_exp2 = (217706 * exponent as i64) >> 16;
190     let exp2 = unbiased_exp2 as i32 + (u64::FULL + bias) - ctlz;
191 
192     // Now need to get our extended, power of 10:
193     let (exp10_hi, exp10_lo) = POWERS_OF_10[(exponent - MIN_DENORMAL_EXP10) as usize];
194     let exp10_hi = exp10_hi;
195     let exp10_lo = exp10_lo;
196     let (mut x_hi, mut x_lo) = mul(mantissa, exp10_hi);
197 
198     // NOTE:
199     //  From here we make a few differences from the Lemire implementation,
200     //  to streamline integration with the slow path algorithm.
201     //
202     //  192-BIT
203     //  -------
204     //
205     //  When we check for halfway representations, for the extended
206     //  192-bit representation, we assume the following logic:
207     //  - If we have `x_hi & mask == mask` and wrapping behavior,
208     //      then we are close to a halfway representation, but 1-bit below.
209     //  - If `merged_hi & mask == mask` and `merged_lo + 1 == 0`, then
210     //      we are within 1-bit of the halfway representation.
211     //  In this case, we should add 1-bit, to get to the halfway
212     //  representation, and round-down, so we can get our `b` representation
213     //  to differentiate `b` from `b+u` near to `b+h`.
214     //
215     //  AFTER-SHIFTING
216     //  --------------
217     //
218     //  After shifting and checking for truncated bits, we have shifted
219     //  to the carry bit + 1. This means we are 2 bits above the hidden
220     //  bit, so we have a halfway representation if `mantissa & 3 == 1`,
221     //  and the truncated bits were 0 (`x_lo == 0` and `x_hi & mask == 0`).
222     //  Here, since we're at least at a halfway representation, round-down
223     //  so we get `b`. We're already **at least** at a halfway representation,
224     //  so we should not add any bits to the shifted mantissa.
225 
226     // Now need to check for a wider approximation.
227     let carry_size = F::MANTISSA_SIZE + 2;
228     let carry_shift = u64::FULL - carry_size - 1;
229     let mask = (1u64 << carry_shift) - 1;
230     if x_hi & mask == mask && x_lo.wrapping_add(mantissa) < mantissa {
231         let (y_hi, y_lo) = mul(mantissa, exp10_lo);
232         let mut merged_hi = x_hi;
233         let merged_lo = x_lo.wrapping_add(y_hi);
234         if merged_lo < x_lo {
235             merged_hi += 1;
236         }
237 
238         // Check for a halfway representation.
239         if merged_hi & mask == mask
240             && merged_lo.wrapping_add(1) == 0
241             && y_lo.wrapping_add(mantissa) < mantissa
242         {
243             // We don't actually care about the accuracy here,
244             // since we're going straight to the extended-float algorithm.
245             return (F::ZERO, false);
246         } else {
247             x_hi = merged_hi;
248             x_lo = merged_lo;
249         }
250     }
251 
252     // Shift to the carry bit (IE, mantissa size + 2).
253     let (mantissa, exp2) = shift_to_carry(x_hi, exp2, carry_shift);
254 
255     // Check for a halfway representation.
256     if x_lo == 0 && x_hi & mask == 0 && mantissa & 3 == 1 {
257         // We don't actually care about the accuracy here,
258         // since we're going straight to the extended-float algorithm.
259         return (F::ZERO, false);
260     }
261 
262     to_float(mantissa, exp2)
263 }
264 
265 /// Create a precise native float using the Eisel-Lemire algorithm.
266 ///
267 /// Note that the Eisel-Lemire algorithm may not be accurate if
268 /// truncated digits occur, so we do a second pass with the
269 /// mantissa + 1 (to solve any halfway issues with truncated
270 /// digits), and if the two values are the same, return true.
271 /// This avoids any costly error estimation, since if `mantissa`
272 /// `mantissa+1` are the same, we cannot have had a halfway case.
273 ///
274 /// Note that if we cannot determine a valid representation,
275 /// we fall back to the extended-float moderate path, so we can
276 /// get an accurate, base representation for big-integer
277 /// algorithms.
278 #[inline]
moderate_path<F>(mantissa: u64, exponent: i32, truncated: bool) -> (F, bool) where F: Float,279 pub(super) fn moderate_path<F>(mantissa: u64, exponent: i32, truncated: bool) -> (F, bool)
280 where
281     F: Float,
282 {
283     let (float, valid) = eisel_lemire(mantissa, exponent);
284     if valid {
285         if !truncated {
286             (float, true)
287         } else {
288             let mantissa_up = mantissa + 1;
289             let (float_up, valid) = eisel_lemire(mantissa_up, exponent);
290             if valid && float == float_up {
291                 (float, true)
292             } else {
293                 (float, false)
294             }
295         }
296     } else {
297         // If the first representation failed, try the extended-float
298         // algorithm, since it's a lot faster for small, denormal floats.
299         extended_float::moderate_path::<F>(mantissa, exponent, truncated)
300     }
301 }
302 
303 // TESTS
304 // -----
305 
306 #[cfg(test)]
307 mod tests {
308     use super::*;
309 
310     #[test]
test_halfway_round_down()311     fn test_halfway_round_down() {
312         // Check only Eisel-Lemire.
313         assert_eq!((9007199254740992.0, true), eisel_lemire::<f64>(9007199254740992, 0));
314         assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740993, 0));
315         assert_eq!((9007199254740994.0, true), eisel_lemire::<f64>(9007199254740994, 0));
316         assert_eq!((9223372036854775808.0, true), eisel_lemire::<f64>(9223372036854775808, 0));
317         assert_eq!((0.0, false), eisel_lemire::<f64>(9223372036854776832, 0));
318         assert_eq!((9223372036854777856.0, true), eisel_lemire::<f64>(9223372036854777856, 0));
319 
320         // We can't get an accurate representation here.
321         assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740992000, -3));
322         assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740993000, -3));
323         assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740994000, -3));
324 
325         // Check with the extended-float backup.
326         assert_eq!((9007199254740992.0, true), moderate_path::<f64>(9007199254740992, 0, false));
327         assert_eq!((9007199254740992.0, false), moderate_path::<f64>(9007199254740993, 0, false));
328         assert_eq!((9007199254740994.0, true), moderate_path::<f64>(9007199254740994, 0, false));
329         assert_eq!(
330             (9223372036854775808.0, true),
331             moderate_path::<f64>(9223372036854775808, 0, false)
332         );
333         assert_eq!(
334             (9223372036854775808.0, false),
335             moderate_path::<f64>(9223372036854776832, 0, false)
336         );
337         assert_eq!(
338             (9223372036854777856.0, true),
339             moderate_path::<f64>(9223372036854777856, 0, false)
340         );
341 
342         // We can't get an accurate from Lemire representation here.
343         assert_eq!(
344             (9007199254740992.0, true),
345             moderate_path::<f64>(9007199254740992000, -3, false)
346         );
347         assert_eq!(
348             (9007199254740992.0, false),
349             moderate_path::<f64>(9007199254740993000, -3, false)
350         );
351         assert_eq!(
352             (9007199254740994.0, true),
353             moderate_path::<f64>(9007199254740994000, -3, false)
354         );
355     }
356 
357     #[test]
test_halfway_round_up()358     fn test_halfway_round_up() {
359         // Check only Eisel-Lemire.
360         assert_eq!((9007199254740994.0, true), eisel_lemire::<f64>(9007199254740994, 0));
361         assert_eq!((9007199254740996.0, true), eisel_lemire::<f64>(9007199254740995, 0));
362         assert_eq!((9007199254740996.0, true), eisel_lemire::<f64>(9007199254740996, 0));
363         assert_eq!((18014398509481988.0, true), eisel_lemire::<f64>(18014398509481988, 0));
364         assert_eq!((18014398509481992.0, true), eisel_lemire::<f64>(18014398509481990, 0));
365         assert_eq!((18014398509481992.0, true), eisel_lemire::<f64>(18014398509481992, 0));
366         assert_eq!((9223372036854777856.0, true), eisel_lemire::<f64>(9223372036854777856, 0));
367         assert_eq!((9223372036854779904.0, true), eisel_lemire::<f64>(9223372036854778880, 0));
368         assert_eq!((9223372036854779904.0, true), eisel_lemire::<f64>(9223372036854779904, 0));
369 
370         // We can't get an accurate representation here.
371         assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740994000, -3));
372         assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740995000, -3));
373         assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740996000, -3));
374 
375         // Check with the extended-float backup.
376         assert_eq!((9007199254740994.0, true), moderate_path::<f64>(9007199254740994, 0, false));
377         assert_eq!((9007199254740996.0, true), moderate_path::<f64>(9007199254740995, 0, false));
378         assert_eq!((9007199254740996.0, true), moderate_path::<f64>(9007199254740996, 0, false));
379         assert_eq!((18014398509481988.0, true), moderate_path::<f64>(18014398509481988, 0, false));
380         assert_eq!((18014398509481992.0, true), moderate_path::<f64>(18014398509481990, 0, false));
381         assert_eq!((18014398509481992.0, true), moderate_path::<f64>(18014398509481992, 0, false));
382         assert_eq!(
383             (9223372036854777856.0, true),
384             moderate_path::<f64>(9223372036854777856, 0, false)
385         );
386         assert_eq!(
387             (9223372036854779904.0, true),
388             moderate_path::<f64>(9223372036854778880, 0, false)
389         );
390         assert_eq!(
391             (9223372036854779904.0, true),
392             moderate_path::<f64>(9223372036854779904, 0, false)
393         );
394 
395         // We can't get an accurate from Lemire representation here.
396         assert_eq!(
397             (9007199254740994.0, true),
398             moderate_path::<f64>(9007199254740994000, -3, false)
399         );
400         assert_eq!(
401             (9007199254740994.0, false),
402             moderate_path::<f64>(9007199254740995000, -3, false)
403         );
404         assert_eq!(
405             (9007199254740996.0, true),
406             moderate_path::<f64>(9007199254740996000, -3, false)
407         );
408     }
409 
410     #[test]
test_mul()411     fn test_mul() {
412         let e1 = 11529215046068469760; // 1e1
413         let e10 = 10737418240000000000; // 1e10
414         assert_eq!((0x5D21DBA000000000, 0x0000000000000000), mul(e1, e10));
415 
416         let e9 = 17179869184000000000; // 1e9
417         let e70 = 13363823550460978230; // 1e70
418         assert_eq!((0xACB92ED9397BF995, 0xA23A700000000000), mul(e9, e70));
419 
420         // e289
421         let e280 = 10162340898095201970; // 1e280
422         assert_eq!((0x83585D8FD9C25DB6, 0xFC31D00000000000), mul(e9, e280));
423 
424         // e290
425         let e0 = 9223372036854775808; // 1e0
426         let e290 = 11830521861667747109; // 1e290
427         assert_eq!((0x52173A79E8197A92, 0x8000000000000000), mul(e0, e290));
428     }
429 }
430