1 ///
2 /// @file fast_div.hpp
3 /// @brief Integer division of small types is much faster than integer
4 /// division of large types on most CPUs. The fast_div(x, y)
5 /// function tries to take advantage of this by casting x and y
6 /// to smaller types (if possible) before doing the division.
7 ///
8 /// If ENABLE_DIV32 is defined we check at runtime if the
9 /// dividend and divisor are < 2^32 and if so we use 32-bit
10 /// integer division instead of 64-bit integer division. On
11 /// most CPUs before 2020 this significantly improves
12 /// performance.
13 ///
14 /// On some new CPUs (such as Intel Cannonlake) 64-bit integer
15 /// division has been improved significantly and runs as fast
16 /// as 32-bit integer division. For such CPUs it is best to
17 /// disable ENABLE_DIV32 (using cmake -DWITH_DIV32=OFF) as this
18 /// avoids runtime checks for (64-bit / 32-bit) divisions.
19 ///
20 /// Copyright (C) 2020 Kim Walisch, <kim.walisch@gmail.com>
21 ///
22 /// This file is distributed under the BSD License. See the COPYING
23 /// file in the top level directory.
24 ///
25
26 #ifndef FAST_DIV_HPP
27 #define FAST_DIV_HPP
28
29 #include <cassert>
30 #include <limits>
31 #include <stdint.h>
32 #include <type_traits>
33
34 namespace {
35
36 /// If ENABLE_DIV32 is defined:
37 ///
38 /// 1) We use 32-bit integer division for (64-bit / 32-bit)
39 /// if the dividend is < 2^32.
40 /// 2) We use 64-bit integer division for (64-bit / 64-bit).
41 /// 3) We use 64-bit integer division for (128-bit / 64-bit)
42 /// if the dividend is < 2^64.
43 ///
44 #if defined(ENABLE_DIV32)
45
46 /// Get the next smaller integer type
47 /// and convert it to unsigned.
48 /// make_smaller< uint64_t>::type -> uint32_t.
49 /// make_smaller<uint128_t>::type -> uint64_t.
50 ///
51 template <typename T>
52 struct make_smaller
53 {
54 using type = typename std::conditional<sizeof(T) / 2 <= sizeof(uint32_t), uint32_t,
55 typename std::conditional<sizeof(T) / 2 <= sizeof(uint64_t), uint64_t,
56 T>::type>::type;
57 };
58
59 /// Used for (64-bit / 64-bit) = 64-bit.
60 template <typename X, typename Y>
61 typename std::enable_if<(sizeof(X) == sizeof(Y)), X>::type
fast_div(X x,Y y)62 fast_div(X x, Y y)
63 {
64 // Unsigned integer division is usually
65 // faster than signed integer division.
66 using UX = typename std::make_unsigned<X>::type;
67 return (UX) x / (UX) y;
68 }
69
70 /// Used for (64-bit / 32-bit) = 64-bit.
71 /// Used for (128-bit / 64-bit) = 128-bit.
72 template <typename X, typename Y>
73 typename std::enable_if<(sizeof(X) > sizeof(Y)), X>::type
fast_div(X x,Y y)74 fast_div(X x, Y y)
75 {
76 using smaller_t = typename make_smaller<X>::type;
77
78 if (x <= std::numeric_limits<smaller_t>::max())
79 return (smaller_t) x / (smaller_t) y;
80 else
81 {
82 // Unsigned integer division is usually
83 // faster than signed integer division.
84 using UX = typename std::make_unsigned<X>::type;
85 using UY = typename std::make_unsigned<Y>::type;
86 return (UX) x / (UY) y;
87 }
88 }
89
90 #else
91
92 /// If ENABLE_DIV32 is not defined:
93 ///
94 /// 1) We use 64-bit integer division for (64-bit / 32-bit).
95 /// 2) We use 64-bit integer division for (64-bit / 64-bit).
96 /// 3) We use 64-bit integer division for (128-bit / 64-bit)
97 /// if the dividend is < 2^64.
98 ///
99
100 /// Get the next smaller integer type
101 /// and convert it to unsigned.
102 /// make_smaller<uint128_t>::type -> uint64_t.
103 ///
104 template <typename T>
105 struct make_smaller
106 {
107 using type = typename std::make_unsigned<
108 typename std::conditional<
109 sizeof(T) == sizeof(uint64_t) * 2,
110 uint64_t, T>::type>::type;
111 };
112
113 /// Used for (64-bit / 32-bit) = 64-bit.
114 /// Used for (64-bit / 64-bit) = 64-bit.
115 template <typename X, typename Y>
116 typename std::enable_if<(sizeof(X) >= sizeof(Y) &&
117 sizeof(X) <= sizeof(uint64_t)), X>::type
118 fast_div(X x, Y y)
119 {
120 // Unsigned integer division is usually
121 // faster than signed integer division.
122 using UX = typename std::make_unsigned<X>::type;
123 return (UX) x / (UX) y;
124 }
125
126 /// Used for (128-bit / 64-bit) = 128-bit.
127 template <typename X, typename Y>
128 typename std::enable_if<(sizeof(X) > sizeof(Y) &&
129 sizeof(X) > sizeof(uint64_t)), X>::type
130 fast_div(X x, Y y)
131 {
132 using smaller_t = typename make_smaller<X>::type;
133
134 if (x <= std::numeric_limits<smaller_t>::max())
135 return (smaller_t) x / (smaller_t) y;
136 else
137 {
138 // Unsigned integer division is usually
139 // faster than signed integer division.
140 using UX = typename std::make_unsigned<X>::type;
141 using UY = typename std::make_unsigned<Y>::type;
142 return (UX) x / (UY) y;
143 }
144 }
145
146 #endif
147
148 /// Used for (128-bit / 64-bit) = 64-bit.
149 /// Use this function only when you know for sure
150 /// that the result is < 2^64.
151 ///
152 template <typename X, typename Y>
153 typename std::enable_if<(sizeof(X) == sizeof(uint64_t) * 2 &&
154 sizeof(Y) <= sizeof(uint64_t)), uint64_t>::type
fast_div64(X x,Y y)155 fast_div64(X x, Y y)
156 {
157 #if defined(__x86_64__) && \
158 (defined(__GNUC__) || defined(__clang__))
159
160 // primecount does not need signed division so
161 // we use the unsigned division instruction further
162 // down as DIV is usually faster than IDIV.
163 assert(x >= 0 && y > 0);
164
165 uint64_t x0 = (uint64_t) x;
166 uint64_t x1 = ((uint64_t*) &x)[1];
167 uint64_t d = y;
168
169 // (128-bit / 64-bit) = 64-bit.
170 // When we know the result fits into 64-bit (even
171 // though the numerator is 128-bit) we can use the divq
172 // instruction instead of doing a full 128-bit division.
173 __asm__("divq %[divider]"
174 : "+a"(x0), "+d"(x1) : [divider] "r"(d));
175
176 return x0;
177 #else
178 return (uint64_t) fast_div(x, y);
179 #endif
180 }
181
182 /// Used for (64-bit / 32-bit) = 64-bit.
183 /// Used for (64-bit / 64-bit) = 64-bit.
184 template <typename X, typename Y>
185 typename std::enable_if<!(sizeof(X) == sizeof(uint64_t) * 2 &&
186 sizeof(Y) <= sizeof(uint64_t)), uint64_t>::type
fast_div64(X x,Y y)187 fast_div64(X x, Y y)
188 {
189 return (uint64_t) fast_div(x, y);
190 }
191
192 } // namespace
193
194 #endif
195