1 ///
2 /// @file  fast_div.hpp
3 /// @brief Integer division of small types is much faster than integer
4 ///        division of large types on most CPUs. The fast_div(x, y)
5 ///        function tries to take advantage of this by casting x and y
6 ///        to smaller types (if possible) before doing the division.
7 ///
8 ///        If ENABLE_DIV32 is defined we check at runtime if the
9 ///        dividend and divisor are < 2^32 and if so we use 32-bit
10 ///        integer division instead of 64-bit integer division. On
11 ///        most CPUs before 2020 this significantly improves
12 ///        performance.
13 ///
14 ///        On some new CPUs (such as Intel Cannonlake) 64-bit integer
15 ///        division has been improved significantly and runs as fast
16 ///        as 32-bit integer division. For such CPUs it is best to
17 ///        disable ENABLE_DIV32 (using cmake -DWITH_DIV32=OFF) as this
18 ///        avoids runtime checks for (64-bit / 32-bit) divisions.
19 ///
20 /// Copyright (C) 2020 Kim Walisch, <kim.walisch@gmail.com>
21 ///
22 /// This file is distributed under the BSD License. See the COPYING
23 /// file in the top level directory.
24 ///
25 
26 #ifndef FAST_DIV_HPP
27 #define FAST_DIV_HPP
28 
29 #include <cassert>
30 #include <limits>
31 #include <stdint.h>
32 #include <type_traits>
33 
34 namespace {
35 
36 /// If ENABLE_DIV32 is defined:
37 ///
38 /// 1) We use 32-bit integer division for (64-bit / 32-bit)
39 ///    if the dividend is < 2^32.
40 /// 2) We use 64-bit integer division for (64-bit / 64-bit).
41 /// 3) We use 64-bit integer division for (128-bit / 64-bit)
42 ///    if the dividend is < 2^64.
43 ///
44 #if defined(ENABLE_DIV32)
45 
46 /// Get the next smaller integer type
47 /// and convert it to unsigned.
48 /// make_smaller< uint64_t>::type -> uint32_t.
49 /// make_smaller<uint128_t>::type -> uint64_t.
50 ///
51 template <typename T>
52 struct make_smaller
53 {
54   using type = typename std::conditional<sizeof(T) / 2 <= sizeof(uint32_t), uint32_t,
55                typename std::conditional<sizeof(T) / 2 <= sizeof(uint64_t), uint64_t,
56                T>::type>::type;
57 };
58 
59 /// Used for (64-bit / 64-bit) = 64-bit.
60 template <typename X, typename Y>
61 typename std::enable_if<(sizeof(X) == sizeof(Y)), X>::type
fast_div(X x,Y y)62 fast_div(X x, Y y)
63 {
64   // Unsigned integer division is usually
65   // faster than signed integer division.
66   using UX = typename std::make_unsigned<X>::type;
67   return (UX) x / (UX) y;
68 }
69 
70 /// Used for  (64-bit / 32-bit) =  64-bit.
71 /// Used for (128-bit / 64-bit) = 128-bit.
72 template <typename X, typename Y>
73 typename std::enable_if<(sizeof(X) > sizeof(Y)), X>::type
fast_div(X x,Y y)74 fast_div(X x, Y y)
75 {
76   using smaller_t = typename make_smaller<X>::type;
77 
78   if (x <= std::numeric_limits<smaller_t>::max())
79     return (smaller_t) x / (smaller_t) y;
80   else
81   {
82     // Unsigned integer division is usually
83     // faster than signed integer division.
84     using UX = typename std::make_unsigned<X>::type;
85     using UY = typename std::make_unsigned<Y>::type;
86     return (UX) x / (UY) y;
87   }
88 }
89 
90 #else
91 
92 /// If ENABLE_DIV32 is not defined:
93 ///
94 /// 1) We use 64-bit integer division for (64-bit / 32-bit).
95 /// 2) We use 64-bit integer division for (64-bit / 64-bit).
96 /// 3) We use 64-bit integer division for (128-bit / 64-bit)
97 ///    if the dividend is < 2^64.
98 ///
99 
100 /// Get the next smaller integer type
101 /// and convert it to unsigned.
102 /// make_smaller<uint128_t>::type -> uint64_t.
103 ///
104 template <typename T>
105 struct make_smaller
106 {
107   using type = typename std::make_unsigned<
108                  typename std::conditional<
109                    sizeof(T) == sizeof(uint64_t) * 2,
110                      uint64_t, T>::type>::type;
111 };
112 
113 /// Used for (64-bit / 32-bit) = 64-bit.
114 /// Used for (64-bit / 64-bit) = 64-bit.
115 template <typename X, typename Y>
116 typename std::enable_if<(sizeof(X) >= sizeof(Y) &&
117                          sizeof(X) <= sizeof(uint64_t)), X>::type
118 fast_div(X x, Y y)
119 {
120   // Unsigned integer division is usually
121   // faster than signed integer division.
122   using UX = typename std::make_unsigned<X>::type;
123   return (UX) x / (UX) y;
124 }
125 
126 /// Used for (128-bit / 64-bit) = 128-bit.
127 template <typename X, typename Y>
128 typename std::enable_if<(sizeof(X) > sizeof(Y) &&
129                          sizeof(X) > sizeof(uint64_t)), X>::type
130 fast_div(X x, Y y)
131 {
132   using smaller_t = typename make_smaller<X>::type;
133 
134   if (x <= std::numeric_limits<smaller_t>::max())
135     return (smaller_t) x / (smaller_t) y;
136   else
137   {
138     // Unsigned integer division is usually
139     // faster than signed integer division.
140     using UX = typename std::make_unsigned<X>::type;
141     using UY = typename std::make_unsigned<Y>::type;
142     return (UX) x / (UY) y;
143   }
144 }
145 
146 #endif
147 
148 /// Used for (128-bit / 64-bit) = 64-bit.
149 /// Use this function only when you know for sure
150 /// that the result is < 2^64.
151 ///
152 template <typename X, typename Y>
153 typename std::enable_if<(sizeof(X) == sizeof(uint64_t) * 2 &&
154                          sizeof(Y) <= sizeof(uint64_t)), uint64_t>::type
fast_div64(X x,Y y)155 fast_div64(X x, Y y)
156 {
157 #if defined(__x86_64__) && \
158    (defined(__GNUC__) || defined(__clang__))
159 
160   // primecount does not need signed division so
161   // we use the unsigned division instruction further
162   // down as DIV is usually faster than IDIV.
163   assert(x >= 0 && y > 0);
164 
165   uint64_t x0 = (uint64_t) x;
166   uint64_t x1 = ((uint64_t*) &x)[1];
167   uint64_t d = y;
168 
169   // (128-bit / 64-bit) = 64-bit.
170   // When we know the result fits into 64-bit (even
171   // though the numerator is 128-bit) we can use the divq
172   // instruction instead of doing a full 128-bit division.
173   __asm__("divq %[divider]"
174           : "+a"(x0), "+d"(x1) : [divider] "r"(d));
175 
176   return x0;
177 #else
178   return (uint64_t) fast_div(x, y);
179 #endif
180 }
181 
182 /// Used for (64-bit / 32-bit) = 64-bit.
183 /// Used for (64-bit / 64-bit) = 64-bit.
184 template <typename X, typename Y>
185 typename std::enable_if<!(sizeof(X) == sizeof(uint64_t) * 2 &&
186                           sizeof(Y) <= sizeof(uint64_t)), uint64_t>::type
fast_div64(X x,Y y)187 fast_div64(X x, Y y)
188 {
189   return (uint64_t) fast_div(x, y);
190 }
191 
192 } // namespace
193 
194 #endif
195