1 //! Optimized division algorithms for u128.
2 //!
3 //! The code in this module is derived off of `dtolnay/itoa`
4 //! and Rust's compiler-builtins crate. This copies a specific
5 //! path of LLVM's `__udivmodti4` intrinsic, which does division/
6 //! modulus for u128 in a single step. Rust implements both division
7 //! and modulus in terms of this intrinsic, but calls the intrinsic
8 //! twice for subsequent division and modulus operations on the same
9 //! dividend/divisor, leading to significant performance overhead.
10 //!
11 //! This module calculates the optimal divisors for each radix,
12 //! and exports a general-purpose division algorithm for u128 where
13 //! the divisor can fit in a u64.
14 //!
15 //! This implementation is derived from dtolnay/itoa, which can be found here:
16 //!     https://github.com/dtolnay/itoa/blob/master/src/udiv128.rs
17 //!
18 //! This implementation is also derived from Rust's compiler-builtins crate,
19 //! which can be found here:
20 //!     https://github.com/rust-lang-nursery/compiler-builtins/blob/master/src/int/udiv.rs
21 //!
22 //! Licensing for this module may be under the MIT or Illinois license
23 //! (a BSD-like license), and may be found here:
24 //!     https://github.com/rust-lang-nursery/compiler-builtins/blob/master/LICENSE.TXT
25 
26 // Get the divisor for optimized 128-bit division.
27 // Returns the divisor, the number of digits processed, and the
28 // number of leading zeros in the divisor.
29 //
30 // These values were calculated using the following script:
31 //
32 //  ```text
33 //  import math
34 //
35 //  u64_max = 2**64 - 1
36 //  u128_max = 2**128-1
37 //
38 //  def is_valid(x):
39 //      return (
40 //          x <= u64_max
41 //          and (u128_max / (x**2)) < x
42 //      )
43 //
44 //  def find_pow(radix):
45 //      start_pow = int(math.floor(math.log(u64_max, radix))) - 1
46 //      while is_valid(radix**start_pow):
47 //          start_pow += 1
48 //      return start_pow - 1
49 //
50 //  for radix in range(2, 37):
51 //      power = find_pow(radix)
52 //      print(radix, radix**power, power)
53 //  ```
54 #[cfg(feature = "radix")]
55 #[inline]
u128_divisor(radix: u32) -> (u64, usize, u32)56 pub(crate) fn u128_divisor(radix: u32) -> (u64, usize, u32) {
57     match radix {
58         2  => (9223372036854775808, 63, 0),    // 2^63
59         3  => (12157665459056928801, 40, 0),   // 3^40
60         4  => (4611686018427387904, 31, 1),    // 4^31
61         5  => (7450580596923828125, 27, 1),    // 5^27
62         6  => (4738381338321616896, 24, 1),    // 6^24
63         7  => (3909821048582988049, 22, 2),    // 7^22
64         8  => (9223372036854775808, 21, 0),    // 8^21
65         9  => (12157665459056928801, 20, 0),   // 9^20
66         10 => (10000000000000000000, 19, 0),   // 10^19
67         11 => (5559917313492231481, 18, 1),    // 11^18
68         12 => (2218611106740436992, 17, 3),    // 12^17
69         13 => (8650415919381337933, 17, 1),    // 13^17
70         14 => (2177953337809371136, 16, 3),    // 14^16
71         15 => (6568408355712890625, 16, 1),    // 15^16
72         16 => (1152921504606846976, 15, 3),    // 16^15
73         17 => (2862423051509815793, 15, 2),    // 17^15
74         18 => (6746640616477458432, 15, 1),    // 18^15
75         19 => (15181127029874798299, 15, 0),   // 19^15
76         20 => (1638400000000000000, 14, 3),    // 20^14
77         21 => (3243919932521508681, 14, 2),    // 21^14
78         22 => (6221821273427820544, 14, 1),    // 22^14
79         23 => (11592836324538749809, 14, 0),   // 23^14
80         24 => (876488338465357824, 13, 4),     // 24^13
81         25 => (1490116119384765625, 13, 3),    // 25^13
82         26 => (2481152873203736576, 13, 2),    // 26^13
83         27 => (4052555153018976267, 13, 2),    // 27^13
84         28 => (6502111422497947648, 13, 1),    // 28^13
85         29 => (10260628712958602189, 13, 0),   // 29^13
86         30 => (15943230000000000000, 13, 0),   // 30^13
87         31 => (787662783788549761, 12, 4),     // 31^12
88         32 => (1152921504606846976, 12, 3),    // 32^12
89         33 => (1667889514952984961, 12, 3),    // 33^12
90         34 => (2386420683693101056, 12, 2),    // 34^12
91         35 => (3379220508056640625, 12, 2),    // 35^12
92         36 => (4738381338321616896, 12, 1),    // 36^12
93         _  => unreachable!(),
94     }
95 }
96 
97 // Get the divisor for optimized 128-bit division.
98 // Returns the divisor, the number of digits processed, and the
99 // number of leading zeros in the divisor.
100 #[cfg(not(feature = "radix"))]
101 #[inline]
102 #[allow(dead_code)]
u128_divisor(_: u32) -> (u64, usize, u32)103 pub(crate) fn u128_divisor(_: u32) -> (u64, usize, u32) {
104     (10000000000000000000, 19, 0)              // 10^19
105 }
106 
107 // Optimized division/remainder algorithm for u128.
108 // This is because the codegen for u128 divrem is very inefficient in Rust,
109 // calling both `__udivmodti4` twice internally, rather than a single time.
110 #[inline]
u128_divrem(n: u128, d: u64, d_cltz: u32) -> (u128, u64)111 pub(crate) fn u128_divrem(n: u128, d: u64, d_cltz: u32) -> (u128, u64) {
112     // Ensure we have the correct number of leading zeros passed.
113     debug_assert_eq!(d_cltz, d.leading_zeros());
114 
115     // Optimize if we can divide using u64 first.
116     let high = (n >> 64) as u64;
117     if high == 0 {
118         let low = n as u64;
119         return ((low / d) as u128, low % d);
120     }
121 
122     // sr = 1 + u64::BITS + d.leading_zeros() - high.leading_zeros();
123     let sr = 65 + d_cltz - high.leading_zeros();
124 
125     // 1 <= sr <= u64::BITS - 1
126     let mut q: u128 = n << (128 - sr);
127     let mut r: u128 = n >> sr;
128     let mut carry: u64 = 0;
129 
130     // Don't use a range because they may generate references to memcpy in unoptimized code
131     // Loop invariants:  r < d; carry is 0 or 1
132     let mut i = 0;
133     while i < sr {
134         i += 1;
135 
136         // r:q = ((r:q) << 1) | carry
137         r = (r << 1) | (q >> 127);
138         q = (q << 1) | carry as u128;
139 
140         // carry = 0
141         // if r >= d {
142         //     r -= d;
143         //     carry = 1;
144         // }
145         let s = (d as u128).wrapping_sub(r).wrapping_sub(1) as i128 >> 127;
146         carry = (s & 1) as u64;
147         r -= (d as u128) & s as u128;
148     }
149 
150     ((q << 1) | carry as u128, r as u64)
151 }
152 
153 // Divide by 1e19 for base10 algorithms.
154 #[cfg(feature = "table")]
u128_divrem_1e19(n: u128) -> (u128, u64)155 pub(crate) fn u128_divrem_1e19(n: u128) -> (u128, u64) {
156     u128_divrem(n, 10000000000000000000, 0)
157 }
158 
159 // TESTS
160 // -----
161 
162 #[cfg(test)]
163 mod tests {
164     use super::*;
165 
166     #[cfg(all(feature = "std", feature = "property_tests"))]
167     use proptest::{proptest, prop_assert_eq, prop_assert};
168 
169     #[cfg(all(feature = "std", feature = "property_tests"))]
170     proptest! {
171         #[test]
172         fn u128_divrem_proptest(i in u128::min_value()..u128::max_value()) {
173             let (d, _, d_cltz) = u128_divisor(10);
174             let expected = (i / d as u128, (i % d as u128) as u64);
175             let actual = u128_divrem(i, d, d_cltz);
176             prop_assert_eq!(actual, expected);
177         }
178     }
179 }
180