src/lexical/rounding.rs

// Adapted from https://github.com/Alexhuszagh/rust-lexical.

//! Defines rounding schemes for floating-point numbers.

use super::float::ExtendedFloat;
use super::num::*;
use super::shift::*;
use crate::lib::mem;

// MASKS

/// Calculate a scalar factor of 2 above the halfway point.
#[inline]
pub(crate) fn nth_bit(n: u64) -> u64 {
    let bits: u64 = mem::size_of::<u64>() as u64 * 8;
    debug_assert!(n < bits, "nth_bit() overflow in shl.");

    1 << n
}

/// Generate a bitwise mask for the lower `n` bits.
#[inline]
pub(crate) fn lower_n_mask(n: u64) -> u64 {
    let bits: u64 = mem::size_of::<u64>() as u64 * 8;
    debug_assert!(n <= bits, "lower_n_mask() overflow in shl.");

    if n == bits {
        u64::max_value()
    } else {
        (1 << n) - 1
    }
}

/// Calculate the halfway point for the lower `n` bits.
#[inline]
pub(crate) fn lower_n_halfway(n: u64) -> u64 {
    let bits: u64 = mem::size_of::<u64>() as u64 * 8;
    debug_assert!(n <= bits, "lower_n_halfway() overflow in shl.");

    if n == 0 {
        0
    } else {
        nth_bit(n - 1)
    }
}

/// Calculate a bitwise mask with `n` 1 bits starting at the `bit` position.
#[inline]
pub(crate) fn internal_n_mask(bit: u64, n: u64) -> u64 {
    let bits: u64 = mem::size_of::<u64>() as u64 * 8;
    debug_assert!(bit <= bits, "internal_n_halfway() overflow in shl.");
    debug_assert!(n <= bits, "internal_n_halfway() overflow in shl.");
    debug_assert!(bit >= n, "internal_n_halfway() overflow in sub.");

    lower_n_mask(bit) ^ lower_n_mask(bit - n)
}

// NEAREST ROUNDING

// Shift right N-bytes and round to the nearest.
//
// Return if we are above halfway and if we are halfway.
#[inline]
pub(crate) fn round_nearest(fp: &mut ExtendedFloat, shift: i32) -> (bool, bool) {
    // Extract the truncated bits using mask.
    // Calculate if the value of the truncated bits are either above
    // the mid-way point, or equal to it.
    //
    // For example, for 4 truncated bytes, the mask would be b1111
    // and the midway point would be b1000.
    let mask: u64 = lower_n_mask(shift as u64);
    let halfway: u64 = lower_n_halfway(shift as u64);

    let truncated_bits = fp.mant & mask;
    let is_above = truncated_bits > halfway;
    let is_halfway = truncated_bits == halfway;

    // Bit shift so the leading bit is in the hidden bit.
    overflowing_shr(fp, shift);

    (is_above, is_halfway)
}

// Tie rounded floating point to event.
#[inline]
pub(crate) fn tie_even(fp: &mut ExtendedFloat, is_above: bool, is_halfway: bool) {
    // Extract the last bit after shifting (and determine if it is odd).
    let is_odd = fp.mant & 1 == 1;

    // Calculate if we need to roundup.
    // We need to roundup if we are above halfway, or if we are odd
    // and at half-way (need to tie-to-even).
    if is_above || (is_odd && is_halfway) {
        fp.mant += 1;
    }
}

// Shift right N-bytes and round nearest, tie-to-even.
//
// Floating-point arithmetic uses round to nearest, ties to even,
// which rounds to the nearest value, if the value is halfway in between,
// round to an even value.
#[inline]
pub(crate) fn round_nearest_tie_even(fp: &mut ExtendedFloat, shift: i32) {
    let (is_above, is_halfway) = round_nearest(fp, shift);
    tie_even(fp, is_above, is_halfway);
}

// DIRECTED ROUNDING

// Shift right N-bytes and round towards a direction.
//
// Return if we have any truncated bytes.
#[inline]
fn round_toward(fp: &mut ExtendedFloat, shift: i32) -> bool {
    let mask: u64 = lower_n_mask(shift as u64);
    let truncated_bits = fp.mant & mask;

    // Bit shift so the leading bit is in the hidden bit.
    overflowing_shr(fp, shift);

    truncated_bits != 0
}

// Round down.
#[inline]
fn downard(_: &mut ExtendedFloat, _: bool) {}

// Shift right N-bytes and round toward zero.
//
// Floating-point arithmetic defines round toward zero, which rounds
// towards positive zero.
#[inline]
pub(crate) fn round_downward(fp: &mut ExtendedFloat, shift: i32) {
    // Bit shift so the leading bit is in the hidden bit.
    // No rounding schemes, so we just ignore everything else.
    let is_truncated = round_toward(fp, shift);
    downard(fp, is_truncated);
}

// ROUND TO FLOAT

// Shift the ExtendedFloat fraction to the fraction bits in a native float.
//
// Floating-point arithmetic uses round to nearest, ties to even,
// which rounds to the nearest value, if the value is halfway in between,
// round to an even value.
#[inline]
pub(crate) fn round_to_float<F, Algorithm>(fp: &mut ExtendedFloat, algorithm: Algorithm)
where
    F: Float,
    Algorithm: FnOnce(&mut ExtendedFloat, i32),
{
    // Calculate the difference to allow a single calculation
    // rather than a loop, to minimize the number of ops required.
    // This does underflow detection.
    let final_exp = fp.exp + F::DEFAULT_SHIFT;
    if final_exp < F::DENORMAL_EXPONENT {
        // We would end up with a denormal exponent, try to round to more
        // digits. Only shift right if we can avoid zeroing out the value,
        // which requires the exponent diff to be < M::BITS. The value
        // is already normalized, so we shouldn't have any issue zeroing
        // out the value.
        let diff = F::DENORMAL_EXPONENT - fp.exp;
        if diff <= u64::FULL {
            // We can avoid underflow, can get a valid representation.
            algorithm(fp, diff);
        } else {
            // Certain underflow, assign literal 0s.
            fp.mant = 0;
            fp.exp = 0;
        }
    } else {
        algorithm(fp, F::DEFAULT_SHIFT);
    }

    if fp.mant & F::CARRY_MASK == F::CARRY_MASK {
        // Roundup carried over to 1 past the hidden bit.
        shr(fp, 1);
    }
}

// AVOID OVERFLOW/UNDERFLOW

// Avoid overflow for large values, shift left as needed.
//
// Shift until a 1-bit is in the hidden bit, if the mantissa is not 0.
#[inline]
pub(crate) fn avoid_overflow<F>(fp: &mut ExtendedFloat)
where
    F: Float,
{
    // Calculate the difference to allow a single calculation
    // rather than a loop, minimizing the number of ops required.
    if fp.exp >= F::MAX_EXPONENT {
        let diff = fp.exp - F::MAX_EXPONENT;
        if diff <= F::MANTISSA_SIZE {
            // Our overflow mask needs to start at the hidden bit, or at
            // `F::MANTISSA_SIZE+1`, and needs to have `diff+1` bits set,
            // to see if our value overflows.
            let bit = (F::MANTISSA_SIZE + 1) as u64;
            let n = (diff + 1) as u64;
            let mask = internal_n_mask(bit, n);
            if (fp.mant & mask) == 0 {
                // If we have no 1-bit in the hidden-bit position,
                // which is index 0, we need to shift 1.
                let shift = diff + 1;
                shl(fp, shift);
            }
        }
    }
}

// ROUND TO NATIVE

// Round an extended-precision float to a native float representation.
#[inline]
pub(crate) fn round_to_native<F, Algorithm>(fp: &mut ExtendedFloat, algorithm: Algorithm)
where
    F: Float,
    Algorithm: FnOnce(&mut ExtendedFloat, i32),
{
    // Shift all the way left, to ensure a consistent representation.
    // The following right-shifts do not work for a non-normalized number.
    fp.normalize();

    // Round so the fraction is in a native mantissa representation,
    // and avoid overflow/underflow.
    round_to_float::<F, _>(fp, algorithm);
    avoid_overflow::<F>(fp);
}