1 // Adapted from https://github.com/Alexhuszagh/rust-lexical.
2
3 //! Defines rounding schemes for floating-point numbers.
4
5 use super::float::ExtendedFloat;
6 use super::num::*;
7 use super::shift::*;
8 use core::mem;
9
10 // MASKS
11
12 /// Calculate a scalar factor of 2 above the halfway point.
13 #[inline]
nth_bit(n: u64) -> u6414 pub(crate) fn nth_bit(n: u64) -> u64 {
15 let bits: u64 = mem::size_of::<u64>() as u64 * 8;
16 debug_assert!(n < bits, "nth_bit() overflow in shl.");
17
18 1 << n
19 }
20
21 /// Generate a bitwise mask for the lower `n` bits.
22 #[inline]
lower_n_mask(n: u64) -> u6423 pub(crate) fn lower_n_mask(n: u64) -> u64 {
24 let bits: u64 = mem::size_of::<u64>() as u64 * 8;
25 debug_assert!(n <= bits, "lower_n_mask() overflow in shl.");
26
27 if n == bits {
28 u64::max_value()
29 } else {
30 (1 << n) - 1
31 }
32 }
33
34 /// Calculate the halfway point for the lower `n` bits.
35 #[inline]
lower_n_halfway(n: u64) -> u6436 pub(crate) fn lower_n_halfway(n: u64) -> u64 {
37 let bits: u64 = mem::size_of::<u64>() as u64 * 8;
38 debug_assert!(n <= bits, "lower_n_halfway() overflow in shl.");
39
40 if n == 0 {
41 0
42 } else {
43 nth_bit(n - 1)
44 }
45 }
46
47 /// Calculate a bitwise mask with `n` 1 bits starting at the `bit` position.
48 #[inline]
internal_n_mask(bit: u64, n: u64) -> u6449 pub(crate) fn internal_n_mask(bit: u64, n: u64) -> u64 {
50 let bits: u64 = mem::size_of::<u64>() as u64 * 8;
51 debug_assert!(bit <= bits, "internal_n_halfway() overflow in shl.");
52 debug_assert!(n <= bits, "internal_n_halfway() overflow in shl.");
53 debug_assert!(bit >= n, "internal_n_halfway() overflow in sub.");
54
55 lower_n_mask(bit) ^ lower_n_mask(bit - n)
56 }
57
58 // NEAREST ROUNDING
59
60 // Shift right N-bytes and round to the nearest.
61 //
62 // Return if we are above halfway and if we are halfway.
63 #[inline]
round_nearest(fp: &mut ExtendedFloat, shift: i32) -> (bool, bool)64 pub(crate) fn round_nearest(fp: &mut ExtendedFloat, shift: i32) -> (bool, bool) {
65 // Extract the truncated bits using mask.
66 // Calculate if the value of the truncated bits are either above
67 // the mid-way point, or equal to it.
68 //
69 // For example, for 4 truncated bytes, the mask would be b1111
70 // and the midway point would be b1000.
71 let mask: u64 = lower_n_mask(shift as u64);
72 let halfway: u64 = lower_n_halfway(shift as u64);
73
74 let truncated_bits = fp.mant & mask;
75 let is_above = truncated_bits > halfway;
76 let is_halfway = truncated_bits == halfway;
77
78 // Bit shift so the leading bit is in the hidden bit.
79 overflowing_shr(fp, shift);
80
81 (is_above, is_halfway)
82 }
83
84 // Tie rounded floating point to event.
85 #[inline]
tie_even(fp: &mut ExtendedFloat, is_above: bool, is_halfway: bool)86 pub(crate) fn tie_even(fp: &mut ExtendedFloat, is_above: bool, is_halfway: bool) {
87 // Extract the last bit after shifting (and determine if it is odd).
88 let is_odd = fp.mant & 1 == 1;
89
90 // Calculate if we need to roundup.
91 // We need to roundup if we are above halfway, or if we are odd
92 // and at half-way (need to tie-to-even).
93 if is_above || (is_odd && is_halfway) {
94 fp.mant += 1;
95 }
96 }
97
98 // Shift right N-bytes and round nearest, tie-to-even.
99 //
100 // Floating-point arithmetic uses round to nearest, ties to even,
101 // which rounds to the nearest value, if the value is halfway in between,
102 // round to an even value.
103 #[inline]
round_nearest_tie_even(fp: &mut ExtendedFloat, shift: i32)104 pub(crate) fn round_nearest_tie_even(fp: &mut ExtendedFloat, shift: i32) {
105 let (is_above, is_halfway) = round_nearest(fp, shift);
106 tie_even(fp, is_above, is_halfway);
107 }
108
109 // DIRECTED ROUNDING
110
111 // Shift right N-bytes and round towards a direction.
112 //
113 // Return if we have any truncated bytes.
114 #[inline]
round_toward(fp: &mut ExtendedFloat, shift: i32) -> bool115 fn round_toward(fp: &mut ExtendedFloat, shift: i32) -> bool {
116 let mask: u64 = lower_n_mask(shift as u64);
117 let truncated_bits = fp.mant & mask;
118
119 // Bit shift so the leading bit is in the hidden bit.
120 overflowing_shr(fp, shift);
121
122 truncated_bits != 0
123 }
124
125 // Round down.
126 #[inline]
downard(_: &mut ExtendedFloat, _: bool)127 fn downard(_: &mut ExtendedFloat, _: bool) {}
128
129 // Shift right N-bytes and round toward zero.
130 //
131 // Floating-point arithmetic defines round toward zero, which rounds
132 // towards positive zero.
133 #[inline]
round_downward(fp: &mut ExtendedFloat, shift: i32)134 pub(crate) fn round_downward(fp: &mut ExtendedFloat, shift: i32) {
135 // Bit shift so the leading bit is in the hidden bit.
136 // No rounding schemes, so we just ignore everything else.
137 let is_truncated = round_toward(fp, shift);
138 downard(fp, is_truncated);
139 }
140
141 // ROUND TO FLOAT
142
143 // Shift the ExtendedFloat fraction to the fraction bits in a native float.
144 //
145 // Floating-point arithmetic uses round to nearest, ties to even,
146 // which rounds to the nearest value, if the value is halfway in between,
147 // round to an even value.
148 #[inline]
round_to_float<F, Algorithm>(fp: &mut ExtendedFloat, algorithm: Algorithm) where F: Float, Algorithm: FnOnce(&mut ExtendedFloat, i32),149 pub(crate) fn round_to_float<F, Algorithm>(fp: &mut ExtendedFloat, algorithm: Algorithm)
150 where
151 F: Float,
152 Algorithm: FnOnce(&mut ExtendedFloat, i32),
153 {
154 // Calculate the difference to allow a single calculation
155 // rather than a loop, to minimize the number of ops required.
156 // This does underflow detection.
157 let final_exp = fp.exp + F::DEFAULT_SHIFT;
158 if final_exp < F::DENORMAL_EXPONENT {
159 // We would end up with a denormal exponent, try to round to more
160 // digits. Only shift right if we can avoid zeroing out the value,
161 // which requires the exponent diff to be < M::BITS. The value
162 // is already normalized, so we shouldn't have any issue zeroing
163 // out the value.
164 let diff = F::DENORMAL_EXPONENT - fp.exp;
165 if diff <= u64::FULL {
166 // We can avoid underflow, can get a valid representation.
167 algorithm(fp, diff);
168 } else {
169 // Certain underflow, assign literal 0s.
170 fp.mant = 0;
171 fp.exp = 0;
172 }
173 } else {
174 algorithm(fp, F::DEFAULT_SHIFT);
175 }
176
177 if fp.mant & F::CARRY_MASK == F::CARRY_MASK {
178 // Roundup carried over to 1 past the hidden bit.
179 shr(fp, 1);
180 }
181 }
182
183 // AVOID OVERFLOW/UNDERFLOW
184
185 // Avoid overflow for large values, shift left as needed.
186 //
187 // Shift until a 1-bit is in the hidden bit, if the mantissa is not 0.
188 #[inline]
avoid_overflow<F>(fp: &mut ExtendedFloat) where F: Float,189 pub(crate) fn avoid_overflow<F>(fp: &mut ExtendedFloat)
190 where
191 F: Float,
192 {
193 // Calculate the difference to allow a single calculation
194 // rather than a loop, minimizing the number of ops required.
195 if fp.exp >= F::MAX_EXPONENT {
196 let diff = fp.exp - F::MAX_EXPONENT;
197 if diff <= F::MANTISSA_SIZE {
198 // Our overflow mask needs to start at the hidden bit, or at
199 // `F::MANTISSA_SIZE+1`, and needs to have `diff+1` bits set,
200 // to see if our value overflows.
201 let bit = (F::MANTISSA_SIZE + 1) as u64;
202 let n = (diff + 1) as u64;
203 let mask = internal_n_mask(bit, n);
204 if (fp.mant & mask) == 0 {
205 // If we have no 1-bit in the hidden-bit position,
206 // which is index 0, we need to shift 1.
207 let shift = diff + 1;
208 shl(fp, shift);
209 }
210 }
211 }
212 }
213
214 // ROUND TO NATIVE
215
216 // Round an extended-precision float to a native float representation.
217 #[inline]
round_to_native<F, Algorithm>(fp: &mut ExtendedFloat, algorithm: Algorithm) where F: Float, Algorithm: FnOnce(&mut ExtendedFloat, i32),218 pub(crate) fn round_to_native<F, Algorithm>(fp: &mut ExtendedFloat, algorithm: Algorithm)
219 where
220 F: Float,
221 Algorithm: FnOnce(&mut ExtendedFloat, i32),
222 {
223 // Shift all the way left, to ensure a consistent representation.
224 // The following right-shifts do not work for a non-normalized number.
225 fp.normalize();
226
227 // Round so the fraction is in a native mantissa representation,
228 // and avoid overflow/underflow.
229 round_to_float::<F, _>(fp, algorithm);
230 avoid_overflow::<F>(fp);
231 }
232