1 //! Estimate the error in an 80-bit approximation of a float.
2 //!
3 //! This estimates the error in a floating-point representation.
4 //!
5 //! This implementation is loosely based off the Golang implementation,
6 //! found here:
7 //!     https://golang.org/src/strconv/atof.go
8 
9 use super::float::*;
10 use super::num::*;
11 use super::powers::*;
12 use super::rounding::*;
13 
14 // ERRORS
15 // ------
16 
17 /// Check if the error is accurate with a round-nearest rounding scheme.
18 #[inline]
nearest_error_is_accurate(errors: u64, fp: &ExtendedFloat, extrabits: u64) -> bool19 fn nearest_error_is_accurate(errors: u64, fp: &ExtendedFloat, extrabits: u64) -> bool {
20     // Round-to-nearest, need to use the halfway point.
21     if extrabits == 65 {
22         // Underflow, we have a shift larger than the mantissa.
23         // Representation is valid **only** if the value is close enough
24         // overflow to the next bit within errors. If it overflows,
25         // the representation is **not** valid.
26         !fp.mant.overflowing_add(errors).1
27     } else {
28         let mask: u64 = lower_n_mask(extrabits);
29         let extra: u64 = fp.mant & mask;
30 
31         // Round-to-nearest, need to check if we're close to halfway.
32         // IE, b10100 | 100000, where `|` signifies the truncation point.
33         let halfway: u64 = lower_n_halfway(extrabits);
34         let cmp1 = halfway.wrapping_sub(errors) < extra;
35         let cmp2 = extra < halfway.wrapping_add(errors);
36 
37         // If both comparisons are true, we have significant rounding error,
38         // and the value cannot be exactly represented. Otherwise, the
39         // representation is valid.
40         !(cmp1 && cmp2)
41     }
42 }
43 
44 #[inline(always)]
error_scale() -> u3245 fn error_scale() -> u32 {
46     8
47 }
48 
49 #[inline(always)]
error_halfscale() -> u3250 fn error_halfscale() -> u32 {
51     error_scale() / 2
52 }
53 
54 #[inline]
error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool55 fn error_is_accurate<F: Float>(count: u32, fp: &ExtendedFloat) -> bool {
56     // Determine if extended-precision float is a good approximation.
57     // If the error has affected too many units, the float will be
58     // inaccurate, or if the representation is too close to halfway
59     // that any operations could affect this halfway representation.
60     // See the documentation for dtoa for more information.
61     let bias = -(F::EXPONENT_BIAS - F::MANTISSA_SIZE);
62     let denormal_exp = bias - 63;
63     // This is always a valid u32, since (denormal_exp - fp.exp)
64     // will always be positive and the significand size is {23, 52}.
65     let extrabits = match fp.exp <= denormal_exp {
66         true => 64 - F::MANTISSA_SIZE + denormal_exp - fp.exp,
67         false => 63 - F::MANTISSA_SIZE,
68     };
69 
70     // Our logic is as follows: we want to determine if the actual
71     // mantissa and the errors during calculation differ significantly
72     // from the rounding point. The rounding point for round-nearest
73     // is the halfway point, IE, this when the truncated bits start
74     // with b1000..., while the rounding point for the round-toward
75     // is when the truncated bits are equal to 0.
76     // To do so, we can check whether the rounding point +/- the error
77     // are >/< the actual lower n bits.
78     //
79     // For whether we need to use signed or unsigned types for this
80     // analysis, see this example, using u8 rather than u64 to simplify
81     // things.
82     //
83     // # Comparisons
84     //      cmp1 = (halfway - errors) < extra
85     //      cmp1 = extra < (halfway + errors)
86     //
87     // # Large Extrabits, Low Errors
88     //
89     //      extrabits = 8
90     //      halfway          =  0b10000000
91     //      extra            =  0b10000010
92     //      errors           =  0b00000100
93     //      halfway - errors =  0b01111100
94     //      halfway + errors =  0b10000100
95     //
96     //      Unsigned:
97     //          halfway - errors = 124
98     //          halfway + errors = 132
99     //          extra            = 130
100     //          cmp1             = true
101     //          cmp2             = true
102     //      Signed:
103     //          halfway - errors = 124
104     //          halfway + errors = -124
105     //          extra            = -126
106     //          cmp1             = false
107     //          cmp2             = true
108     //
109     // # Conclusion
110     //
111     // Since errors will always be small, and since we want to detect
112     // if the representation is accurate, we need to use an **unsigned**
113     // type for comparisons.
114 
115     let extrabits = extrabits as u64;
116     let errors = count as u64;
117     if extrabits > 65 {
118         // Underflow, we have a literal 0.
119         return true;
120     }
121 
122     nearest_error_is_accurate(errors, fp, extrabits)
123 }
124 
125 // MODERATE PATH
126 // -------------
127 
128 /// Multiply the floating-point by the exponent.
129 ///
130 /// Multiply by pre-calculated powers of the base, modify the extended-
131 /// float, and return if new value and if the value can be represented
132 /// accurately.
multiply_exponent_extended<F>(fp: &mut ExtendedFloat, exponent: i32, truncated: bool) -> bool where F: Float,133 fn multiply_exponent_extended<F>(fp: &mut ExtendedFloat, exponent: i32, truncated: bool) -> bool
134 where
135     F: Float,
136 {
137     if exponent < MIN_DENORMAL_EXP10 {
138         // Guaranteed underflow (assign 0).
139         fp.mant = 0;
140         true
141     } else if exponent > MAX_NORMAL_EXP10 {
142         // Overflow (assign infinity)
143         fp.mant = 1 << 63;
144         fp.exp = 0x7FF;
145         true
146     } else {
147         // Within the valid exponent range, multiply by the large and small
148         // exponents and return the resulting value.
149 
150         // Track errors to as a factor of unit in last-precision.
151         let mut errors: u32 = 0;
152         if truncated {
153             errors += error_halfscale();
154         }
155 
156         // Infer the binary exponent from the power of 10.
157         // Adjust this exponent to the fact the value is normalized (1<<63).
158         let exp = -63 + (217706 * exponent as i64 >> 16);
159         let mant = POWERS_OF_10[(exponent - MIN_DENORMAL_EXP10) as usize].0;
160         let large = ExtendedFloat { mant, exp: exp as i32 };
161 
162         // Normalize fp and multiple by large.
163         fp.normalize();
164         fp.imul(&large);
165         if errors > 0 {
166             errors += 1;
167         }
168         errors += error_halfscale();
169 
170         // Normalize the floating point (and the errors).
171         let shift = fp.normalize();
172         errors <<= shift;
173 
174         error_is_accurate::<F>(errors, &fp)
175     }
176 }
177 
178 /// Create a precise native float using an intermediate extended-precision float.
179 ///
180 /// Return the float approximation and if the value can be accurately
181 /// represented with mantissa bits of precision.
182 #[inline]
moderate_path<F>(mantissa: u64, exponent: i32, truncated: bool) -> (F, bool) where F: Float,183 pub(super) fn moderate_path<F>(mantissa: u64, exponent: i32, truncated: bool) -> (F, bool)
184 where
185     F: Float,
186 {
187     let mut fp = ExtendedFloat {
188         mant: mantissa,
189         exp: 0,
190     };
191     let valid = multiply_exponent_extended::<F>(&mut fp, exponent, truncated);
192     if valid {
193         let float = fp.into_float::<F>();
194         (float, true)
195     } else {
196         // Need the slow-path algorithm.
197         let float = fp.into_downward_float::<F>();
198         (float, false)
199     }
200 }
201 
202 // TESTS
203 // -----
204 
205 #[cfg(test)]
206 mod tests {
207     use super::*;
208 
209     #[test]
moderate_path_test()210     fn moderate_path_test() {
211         let (f, valid) = moderate_path::<f64>(1234567890, -1, false);
212         assert!(valid, "should be valid");
213         assert_eq!(f, 123456789.0);
214 
215         let (f, valid) = moderate_path::<f64>(1234567891, -1, false);
216         assert!(valid, "should be valid");
217         assert_eq!(f, 123456789.1);
218 
219         let (f, valid) = moderate_path::<f64>(12345678912, -2, false);
220         assert!(valid, "should be valid");
221         assert_eq!(f, 123456789.12);
222 
223         let (f, valid) = moderate_path::<f64>(123456789123, -3, false);
224         assert!(valid, "should be valid");
225         assert_eq!(f, 123456789.123);
226 
227         let (f, valid) = moderate_path::<f64>(1234567891234, -4, false);
228         assert!(valid, "should be valid");
229         assert_eq!(f, 123456789.1234);
230 
231         let (f, valid) = moderate_path::<f64>(12345678912345, -5, false);
232         assert!(valid, "should be valid");
233         assert_eq!(f, 123456789.12345);
234 
235         let (f, valid) = moderate_path::<f64>(123456789123456, -6, false);
236         assert!(valid, "should be valid");
237         assert_eq!(f, 123456789.123456);
238 
239         let (f, valid) = moderate_path::<f64>(1234567891234567, -7, false);
240         assert!(valid, "should be valid");
241         assert_eq!(f, 123456789.1234567);
242 
243         let (f, valid) = moderate_path::<f64>(12345678912345679, -8, false);
244         assert!(valid, "should be valid");
245         assert_eq!(f, 123456789.12345679);
246 
247         let (f, valid) = moderate_path::<f64>(4628372940652459, -17, false);
248         assert!(valid, "should be valid");
249         assert_eq!(f, 0.04628372940652459);
250 
251         let (f, valid) = moderate_path::<f64>(26383446160308229, -272, false);
252         assert!(valid, "should be valid");
253         assert_eq!(f, 2.6383446160308229e-256);
254 
255         let (_, valid) = moderate_path::<f64>(26383446160308230, -272, false);
256         assert!(!valid, "should be invalid");
257     }
258 }
259