//! Fast estimation of the accurate representation of a float.
//!
//! Based off the Golang implementation of the Eisel-Lemire algorithm,
//! found here:
//! https://github.com/golang/go/blob/2ebe77a2fda1ee9ff6fd9a3e08933ad1ebaea039/src/strconv/eisel_lemire.go
//!
//! Which itself was based off of the Wuffs implementation:
//! https://github.com/google/wuffs/blob/ba3818cb6b473a2ed0b38ecfc07dbbd3a97e8ae7/internal/cgen/base/floatconv-submodule-code.c
//!
//! The original algorithm may be found here:
//! https://github.com/lemire/fast_double_parser
//!
//! And an in-depth blogpost describing the algorithms may be found here:
//! https://nigeltao.github.io/blog/2020/eisel-lemire.html
//!
//! # Magic Number Generation
//!
//! ```python
//! import math
//!
//! def get_range(max_exp, bitshift):
//!     den = 1 << bitshift
//!     num = int(math.ceil(math.log2(10) * den))
//!     for exp10 in range(0, max_exp):
//!         exp2_exact = int(math.log2(10**exp10))
//!         exp2_guess = num * exp10 // den
//!         if exp2_exact != exp2_guess:
//!             raise ValueError(f'{exp10}')
//!     return num, den
//! ```
//!
//! For 64-bit and smaller floats, we therefore need a bitshift of 16,
//! so our magic number is `217706`. For 128-bit floats, we need a bitshift
//! of >= 25, so we round up to 32, and therefore need a magic number
//! of `14267572528`. Note that due to the storage requirements,
//! 128-bit floats do not currently use this algorithm.

use super::extended_float;
use super::num::*;
use super::powers::*;

// MUL
// ---

/// Multiply two unsigned, integral values, and return the hi and lo product.
///
/// Computes the exact 128-bit product `x * y` and returns it as
/// `(high 64 bits, low 64 bits)`.
#[inline(always)]
pub(crate) fn mul(x: u64, y: u64) -> (u64, u64) {
    // Use the native widening multiply instead of decomposing the
    // operands into 32-bit halves by hand: `u128` multiplication is
    // exact, and on 64-bit targets it lowers to a single
    // widening-multiply instruction pair (e.g. `mul`/`umulh`).
    let product = (x as u128) * (y as u128);
    ((product >> 64) as u64, product as u64)
}

// SHIFT
// -----

/// Shift significant digits to at most the carry bit.
///
/// The carry bit is 1 above the hidden bit, in the exponent,
/// or mantissa size + 2.
#[inline(always)]
fn shift_to_carry(x_hi: u64, exp2: i32, carry_shift: i32) -> (u64, i32) {
    // The most-significant bit of the product tells us whether the
    // normalized multiply carried into the top bit:
    // - If it did (msb == 1), shift one extra place so the highest set
    //   bit lands on the carry bit, and keep the exponent estimate.
    // - If it did not (msb == 0), the product has a leading zero, so
    //   the binary exponent estimate is one too high: decrement it.
    // The function is concretely typed, so we use native shifts/casts
    // rather than the generic integer-trait helpers.
    let msb = (x_hi >> 63) as i32;
    let mantissa = x_hi >> (msb + carry_shift);
    let exp2 = exp2 - (1i32 ^ msb);

    (mantissa, exp2)
}

// TO FLOAT
// --------

/// Convert mantissa and binary exponent to floating-point representation.
///
/// This function expects the following things:
/// 1). The highest mantissa bit set is 1 above the carry bit.
/// 2). The lowest mantissa bit set is the carry bit.
///     That is, 2 above the hidden bit, or 1 above the hidden bit.
/// 3). The binary exponent is adjusted for the exponent bias.
///
/// Returns `(float, valid)`; when `valid` is false, the caller must
/// recompute the value with a slower algorithm.
#[inline(always)]
fn to_float<F>(mantissa: u64, exp: i32) -> (F, bool)
where
    F: Float,
{
    // Check denormal values for underflow.
    if exp <= -(F::MANTISSA_SIZE + 2) {
        // Have a literal zero. If we shift the bits, we'll get 0.
        // This is an exact result, so report it as valid.
        return (F::ZERO, true);
    } else if exp <= 0 {
        // Subnormal range. We don't actually care about the accuracy
        // here, since we're going straight to the extended-float
        // algorithm: report invalid so the caller falls back.
        return (F::ZERO, false);
    }

    // Get our raw bits, widened/narrowed to the float's unsigned type.
    let mut exp = F::Unsigned::as_cast(exp);
    let mut mantissa = F::Unsigned::as_cast(mantissa);

    // Round-nearest, tie-even: adding the low bit before the shift
    // below rounds up exactly when the bit to be shifted out is set.
    let zero = F::Unsigned::ZERO;
    let one = F::Unsigned::as_cast(1);
    mantissa += mantissa & one;

    // Shift them into position: drop the carry bit, then renormalize
    // if rounding overflowed past the allowed precision.
    mantissa >>= 1i32;
    let precision = F::MANTISSA_SIZE + 1;
    if mantissa >> precision > zero {
        mantissa >>= 1i32;
        exp += one;
    }

    // Check our mantissa representation is valid, that is,
    // no bits above the hidden bit are set.
    let mask = F::MANTISSA_MASK | F::HIDDEN_BIT_MASK;
    debug_assert!(mantissa & mask == mantissa);

    // Check for overflow, if so, return a literal infinity.
    let max_exp = F::MAX_EXPONENT + F::EXPONENT_BIAS;
    if exp >= F::Unsigned::as_cast(max_exp) {
        let float = F::from_bits(F::INFINITY_BITS);
        return (float, true);
    }

    // Assemble the IEEE bits: strip the hidden bit (its value is
    // implied by a non-zero exponent field), position the exponent,
    // and merge the two fields.
    mantissa &= F::MANTISSA_MASK;
    exp <<= F::MANTISSA_SIZE;
    let bits = exp | mantissa;

    (F::from_bits(bits), true)
}

// EISEL-LEMIRE
// ------------

149 /// Create a precise native float using the Eisel-Lemire algorithm.
150 ///
151 /// NOTE: If the Eisel-Lamire algorithm cannot differentiate a halfway
152 /// representation, it cannot determine whether to round up or down
153 /// to determine the correct `b` value for big-float determination.
154 ///
155 /// In that case, we fall back to extended-float to determine that
156 /// representation.
157 #[inline]
eisel_lemire<F>(mantissa: u64, exponent: i32) -> (F, bool) where F: Float,158 pub(super) fn eisel_lemire<F>(mantissa: u64, exponent: i32) -> (F, bool)
159 where
160 F: Float,
161 {
162 // Check if the value is outside of our max range:
163 // If the value is above our max range, we have to have an infinity,
164 // and we have an exact representation (1e348) is infinity, which
165 // is the minimum possible value above this range.
166 //
167 // For negative values, we're guaranteed to have 0 as well:
168 // with 2470328229206232720e-342, we round to 0, while with
169 // 2470328229206232721e-342, we round up to 5e-324. Both of these
170 // contain the maximum number of mantissa digits (19), so our
171 // base-10 exponent cannot get any lower.
172 //
173 // Note that this only checks beyond the limits of f64, we do
174 // checks for narrower types further in.
175 if exponent < MIN_DENORMAL_EXP10 {
176 return (F::ZERO, true);
177 } else if exponent > MAX_NORMAL_EXP10 {
178 let float = F::from_bits(F::INFINITY_BITS);
179 return (float, true);
180 }
181
182 // Normalize the mantissa, and calculate the bias.
183 let ctlz = mantissa.leading_zeros() as i32;
184 let mantissa = mantissa << ctlz;
185 let bias = F::EXPONENT_BIAS - F::MANTISSA_SIZE;
186
187 // Need to convert our base 10 exponent to base 2, as an estimate.
188 // See module documentation for how we generated these magic numbers.
189 let unbiased_exp2 = (217706 * exponent as i64) >> 16;
190 let exp2 = unbiased_exp2 as i32 + (u64::FULL + bias) - ctlz;
191
192 // Now need to get our extended, power of 10:
193 let (exp10_hi, exp10_lo) = POWERS_OF_10[(exponent - MIN_DENORMAL_EXP10) as usize];
194 let exp10_hi = exp10_hi;
195 let exp10_lo = exp10_lo;
196 let (mut x_hi, mut x_lo) = mul(mantissa, exp10_hi);
197
198 // NOTE:
199 // From here we make a few differences from the Lemire implementation,
200 // to streamline integration with the slow path algorithm.
201 //
202 // 192-BIT
203 // -------
204 //
205 // When we check for halfway representations, for the extended
206 // 192-bit representation, we assume the following logic:
207 // - If we have `x_hi & mask == mask` and wrapping behavior,
208 // then we are close to a halfway representation, but 1-bit below.
209 // - If `merged_hi & mask == mask` and `merged_lo + 1 == 0`, then
210 // we are within 1-bit of the halfway representation.
211 // In this case, we should add 1-bit, to get to the halfway
212 // representation, and round-down, so we can get our `b` representation
213 // to differentiate `b` from `b+u` near to `b+h`.
214 //
215 // AFTER-SHIFTING
216 // --------------
217 //
218 // After shifting and checking for truncated bits, we have shifted
219 // to the carry bit + 1. This means we are 2 bits above the hidden
220 // bit, so we have a halfway representation if `mantissa & 3 == 1`,
221 // and the truncated bits were 0 (`x_lo == 0` and `x_hi & mask == 0`).
222 // Here, since we're at least at a halfway representation, round-down
223 // so we get `b`. We're already **at least** at a halfway representation,
224 // so we should not add any bits to the shifted mantissa.
225
226 // Now need to check for a wider approximation.
227 let carry_size = F::MANTISSA_SIZE + 2;
228 let carry_shift = u64::FULL - carry_size - 1;
229 let mask = (1u64 << carry_shift) - 1;
230 if x_hi & mask == mask && x_lo.wrapping_add(mantissa) < mantissa {
231 let (y_hi, y_lo) = mul(mantissa, exp10_lo);
232 let mut merged_hi = x_hi;
233 let merged_lo = x_lo.wrapping_add(y_hi);
234 if merged_lo < x_lo {
235 merged_hi += 1;
236 }
237
238 // Check for a halfway representation.
239 if merged_hi & mask == mask
240 && merged_lo.wrapping_add(1) == 0
241 && y_lo.wrapping_add(mantissa) < mantissa
242 {
243 // We don't actually care about the accuracy here,
244 // since we're going straight to the extended-float algorithm.
245 return (F::ZERO, false);
246 } else {
247 x_hi = merged_hi;
248 x_lo = merged_lo;
249 }
250 }
251
252 // Shift to the carry bit (IE, mantissa size + 2).
253 let (mantissa, exp2) = shift_to_carry(x_hi, exp2, carry_shift);
254
255 // Check for a halfway representation.
256 if x_lo == 0 && x_hi & mask == 0 && mantissa & 3 == 1 {
257 // We don't actually care about the accuracy here,
258 // since we're going straight to the extended-float algorithm.
259 return (F::ZERO, false);
260 }
261
262 to_float(mantissa, exp2)
263 }
265 /// Create a precise native float using the Eisel-Lemire algorithm.
266 ///
267 /// Note that the Eisel-Lemire algorithm may not be accurate if
268 /// truncated digits occur, so we do a second pass with the
269 /// mantissa + 1 (to solve any halfway issues with truncated
270 /// digits), and if the two values are the same, return true.
271 /// This avoids any costly error estimation, since if `mantissa`
272 /// `mantissa+1` are the same, we cannot have had a halfway case.
273 ///
274 /// Note that if we cannot determine a valid representation,
275 /// we fall back to the extended-float moderate path, so we can
276 /// get an accurate, base representation for big-integer
277 /// algorithms.
278 #[inline]
moderate_path<F>(mantissa: u64, exponent: i32, truncated: bool) -> (F, bool) where F: Float,279 pub(super) fn moderate_path<F>(mantissa: u64, exponent: i32, truncated: bool) -> (F, bool)
280 where
281 F: Float,
282 {
283 let (float, valid) = eisel_lemire(mantissa, exponent);
284 if valid {
285 if !truncated {
286 (float, true)
287 } else {
288 let mantissa_up = mantissa + 1;
289 let (float_up, valid) = eisel_lemire(mantissa_up, exponent);
290 if valid && float == float_up {
291 (float, true)
292 } else {
293 (float, false)
294 }
295 }
296 } else {
297 // If the first representation failed, try the extended-float
298 // algorithm, since it's a lot faster for small, denormal floats.
299 extended_float::moderate_path::<F>(mantissa, exponent, truncated)
300 }
301 }

// TESTS
// -----

#[cfg(test)]
mod tests {
    use super::*;

    /// Halfway values that must round DOWN to the even neighbor.
    /// `(0.0, false)` marks inputs where Eisel-Lemire cannot decide
    /// and defers to the slower algorithm.
    #[test]
    fn test_halfway_round_down() {
        // Check only Eisel-Lemire.
        assert_eq!((9007199254740992.0, true), eisel_lemire::<f64>(9007199254740992, 0));
        assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740993, 0));
        assert_eq!((9007199254740994.0, true), eisel_lemire::<f64>(9007199254740994, 0));
        assert_eq!((9223372036854775808.0, true), eisel_lemire::<f64>(9223372036854775808, 0));
        assert_eq!((0.0, false), eisel_lemire::<f64>(9223372036854776832, 0));
        assert_eq!((9223372036854777856.0, true), eisel_lemire::<f64>(9223372036854777856, 0));

        // We can't get an accurate representation here.
        assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740992000, -3));
        assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740993000, -3));
        assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740994000, -3));

        // Check with the extended-float backup.
        assert_eq!((9007199254740992.0, true), moderate_path::<f64>(9007199254740992, 0, false));
        assert_eq!((9007199254740992.0, false), moderate_path::<f64>(9007199254740993, 0, false));
        assert_eq!((9007199254740994.0, true), moderate_path::<f64>(9007199254740994, 0, false));
        assert_eq!(
            (9223372036854775808.0, true),
            moderate_path::<f64>(9223372036854775808, 0, false)
        );
        assert_eq!(
            (9223372036854775808.0, false),
            moderate_path::<f64>(9223372036854776832, 0, false)
        );
        assert_eq!(
            (9223372036854777856.0, true),
            moderate_path::<f64>(9223372036854777856, 0, false)
        );

        // We can't get an accurate representation from Lemire here.
        assert_eq!(
            (9007199254740992.0, true),
            moderate_path::<f64>(9007199254740992000, -3, false)
        );
        assert_eq!(
            (9007199254740992.0, false),
            moderate_path::<f64>(9007199254740993000, -3, false)
        );
        assert_eq!(
            (9007199254740994.0, true),
            moderate_path::<f64>(9007199254740994000, -3, false)
        );
    }

    /// Halfway values that must round UP to the even neighbor.
    #[test]
    fn test_halfway_round_up() {
        // Check only Eisel-Lemire.
        assert_eq!((9007199254740994.0, true), eisel_lemire::<f64>(9007199254740994, 0));
        assert_eq!((9007199254740996.0, true), eisel_lemire::<f64>(9007199254740995, 0));
        assert_eq!((9007199254740996.0, true), eisel_lemire::<f64>(9007199254740996, 0));
        assert_eq!((18014398509481988.0, true), eisel_lemire::<f64>(18014398509481988, 0));
        assert_eq!((18014398509481992.0, true), eisel_lemire::<f64>(18014398509481990, 0));
        assert_eq!((18014398509481992.0, true), eisel_lemire::<f64>(18014398509481992, 0));
        assert_eq!((9223372036854777856.0, true), eisel_lemire::<f64>(9223372036854777856, 0));
        assert_eq!((9223372036854779904.0, true), eisel_lemire::<f64>(9223372036854778880, 0));
        assert_eq!((9223372036854779904.0, true), eisel_lemire::<f64>(9223372036854779904, 0));

        // We can't get an accurate representation here.
        assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740994000, -3));
        assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740995000, -3));
        assert_eq!((0.0, false), eisel_lemire::<f64>(9007199254740996000, -3));

        // Check with the extended-float backup.
        assert_eq!((9007199254740994.0, true), moderate_path::<f64>(9007199254740994, 0, false));
        assert_eq!((9007199254740996.0, true), moderate_path::<f64>(9007199254740995, 0, false));
        assert_eq!((9007199254740996.0, true), moderate_path::<f64>(9007199254740996, 0, false));
        assert_eq!((18014398509481988.0, true), moderate_path::<f64>(18014398509481988, 0, false));
        assert_eq!((18014398509481992.0, true), moderate_path::<f64>(18014398509481990, 0, false));
        assert_eq!((18014398509481992.0, true), moderate_path::<f64>(18014398509481992, 0, false));
        assert_eq!(
            (9223372036854777856.0, true),
            moderate_path::<f64>(9223372036854777856, 0, false)
        );
        assert_eq!(
            (9223372036854779904.0, true),
            moderate_path::<f64>(9223372036854778880, 0, false)
        );
        assert_eq!(
            (9223372036854779904.0, true),
            moderate_path::<f64>(9223372036854779904, 0, false)
        );

        // We can't get an accurate representation from Lemire here.
        assert_eq!(
            (9007199254740994.0, true),
            moderate_path::<f64>(9007199254740994000, -3, false)
        );
        assert_eq!(
            (9007199254740994.0, false),
            moderate_path::<f64>(9007199254740995000, -3, false)
        );
        assert_eq!(
            (9007199254740996.0, true),
            moderate_path::<f64>(9007199254740996000, -3, false)
        );
    }

    /// Spot-check the 128-bit product against known extended powers of 10.
    /// Operands are normalized (msb-aligned) mantissas of powers of 10.
    #[test]
    fn test_mul() {
        let e1 = 11529215046068469760; // 1e1
        let e10 = 10737418240000000000; // 1e10
        assert_eq!((0x5D21DBA000000000, 0x0000000000000000), mul(e1, e10));

        let e9 = 17179869184000000000; // 1e9
        let e70 = 13363823550460978230; // 1e70
        assert_eq!((0xACB92ED9397BF995, 0xA23A700000000000), mul(e9, e70));

        // 1e9 * 1e280 == 1e289
        let e280 = 10162340898095201970; // 1e280
        assert_eq!((0x83585D8FD9C25DB6, 0xFC31D00000000000), mul(e9, e280));

        // 1e0 * 1e290 == 1e290
        let e0 = 9223372036854775808; // 1e0
        let e290 = 11830521861667747109; // 1e290
        assert_eq!((0x52173A79E8197A92, 0x8000000000000000), mul(e0, e290));
    }
}