1 // libdivide.h
2 // Copyright 2010 - 2016 ridiculous_fish
3
4 /*
5 libdivide
6 Copyright (C) 2010 ridiculous_fish
7
8 This software is provided 'as-is', without any express or implied
9 warranty. In no event will the authors be held liable for any damages
10 arising from the use of this software.
11
12 Permission is granted to anyone to use this software for any purpose,
13 including commercial applications, and to alter it and redistribute it
14 freely, subject to the following restrictions:
15
16 1. The origin of this software must not be misrepresented; you must not
17 claim that you wrote the original software. If you use this software
18 in a product, an acknowledgment in the product documentation would be
19 appreciated but is not required.
20 2. Altered source versions must be plainly marked as such, and must not be
21 misrepresented as being the original software.
22 3. This notice may not be removed or altered from any source distribution.
23
24 libdivide@ridiculousfish.com
25 */
26 /* modified from https://github.com/ridiculousfish/libdivide for click */
27
28 #ifndef CLICK_LIBDIVIDE_H
29 #define CLICK_LIBDIVIDE_H
30 #include <click/glue.hh>
31 #include <click/integers.hh>
32 CLICK_DECLS
33
#ifndef __has_builtin
#define __has_builtin(x) 0 // Compatibility with non-clang compilers.
#endif

// Native 128-bit integer support (GCC/Clang extension).
#if defined(__SIZEOF_INT128__)
#define HAS_INT128_T 1
#endif

// Architecture detection; used below to select the x86 inline-asm divide.
#if defined(__x86_64__) || defined(_WIN64) || defined(_M_X64)
#define LIBDIVIDE_IS_X86_64 1
#endif

#if defined(__i386__)
#define LIBDIVIDE_IS_i386 1
#endif

// GCC-style extended asm syntax is available on GCC and Clang.
#if __GNUC__ || __clang__
#define LIBDIVIDE_GCC_STYLE_ASM 1
#endif

// NOTE(review): no direct <assert.h> include here — assumes assert() is
// made available via <click/glue.hh>; confirm for all build modes.
#define LIBDIVIDE_ASSERT(x) assert(x)
55
56 // Explanation of "more" field: bit 6 is whether to use shift path. If we are
57 // using the shift path, bit 7 is whether the divisor is negative in the signed
58 // case; in the unsigned case it is 0. Bits 0-4 is shift value (for shift
59 // path or mult path). In 32 bit case, bit 5 is always 0. We use bit 7 as the
60 // "negative divisor indicator" so that we can use sign extension to
61 // efficiently go to a full-width -1.
62 //
63 // u32: [0-4] shift value
64 // [5] ignored
65 // [6] add indicator
66 // [7] shift path
67 //
68 // s32: [0-4] shift value
69 // [5] shift path
70 // [6] add indicator
71 // [7] indicates negative divisor
72 //
73 // u64: [0-5] shift value
74 // [6] add indicator
75 // [7] shift path
76 //
77 // s64: [0-5] shift value
78 // [6] add indicator
79 // [7] indicates negative divisor
80 // magic number of 0 indicates shift path (we ran out of bits!)
81 //
// In s32 and s64 branching (default) modes, the magic number is negated
// according to whether the divisor is negated. In the branchfree strategy, it
// is not negated.
84
// Masks and flag bits packed into the "more" field (layout documented above).
enum {
    LIBDIVIDE_32_SHIFT_MASK = 0x1F,    // shift amount for 32-bit types
    LIBDIVIDE_64_SHIFT_MASK = 0x3F,    // shift amount for 64-bit types
    LIBDIVIDE_ADD_MARKER = 0x40,       // "add" step needed (33-bit magic)
    LIBDIVIDE_U32_SHIFT_PATH = 0x80,   // u32: divisor is a power of 2
    LIBDIVIDE_U64_SHIFT_PATH = 0x80,   // u64: divisor is a power of 2
    LIBDIVIDE_S32_SHIFT_PATH = 0x20,   // s32: shift-only path
    LIBDIVIDE_NEGATIVE_DIVISOR = 0x80  // signed types: divisor is negative
};
94
// Precomputed state for dividing u32 values by a fixed divisor
// (branching algorithm). Produced by libdivide_u32_gen.
struct libdivide_u32_t {
    uint32_t magic;  // magic multiplier; 0 selects the power-of-2 shift path
    uint8_t more;    // packed shift amount and flag bits (see layout above)
};
99
// Precomputed state for the branchfree u32 algorithm. Produced by
// libdivide_u32_branchfree_gen; "more" holds only the shift amount.
struct libdivide_u32_branchfree_t {
    uint32_t magic;  // magic multiplier (0 for powers of 2)
    uint8_t more;    // shift amount only; no flag bits
};
104
#ifndef LIBDIVIDE_API
// Everything in this header is defined in place, so default to internal
// linkage; a build may override LIBDIVIDE_API before including this file.
#define LIBDIVIDE_API static inline
#endif

// Precompute a divider for divisor y (branching algorithm).
LIBDIVIDE_API struct libdivide_u32_t libdivide_u32_gen(uint32_t y);

// Precompute a divider for divisor y (branchfree algorithm; y == 1 is
// rejected by an assert in the generator).
LIBDIVIDE_API struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t y);

// Compute numer / d, where d is the divisor *denom was generated from.
LIBDIVIDE_API uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom);

// Branchfree equivalent of libdivide_u32_do.
LIBDIVIDE_API uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom);

// Recover the original divisor from a divider.
LIBDIVIDE_API uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);

LIBDIVIDE_API uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom);

// Return which algorithm a divider uses (0, 1, or 2); the result selects
// the matching libdivide_u32_do_algN entry point.
LIBDIVIDE_API int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom);
125
126
127 //////// Internal Utility Functions
128
// Return the high 32 bits of the full 64-bit product x * y.
static inline uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) {
    uint64_t product = (uint64_t)x * (uint64_t)y;
    return (uint32_t)(product >> 32);
}
134
// Count leading zero bits of val. On the builtin path, val == 0 is
// undefined (__builtin_clz(0) is UB); the portable path returns 32 for 0.
static inline int32_t libdivide__count_leading_zeros32(uint32_t val) {
#if __GNUC__ || __has_builtin(__builtin_clz)
    // Fast way to count leading zeros
    return __builtin_clz(val);
#else
    // Portable fallback: probe from the most significant bit downward.
    int32_t zeros = 0;
    for (uint32_t probe = 1U << 31; probe && !(val & probe); probe >>= 1)
        zeros++;
    return zeros;
#endif
}
150
// Count leading zero bits of a 64-bit value; same caveat as the 32-bit
// version regarding val == 0 on the builtin path.
static inline int32_t libdivide__count_leading_zeros64(uint64_t val) {
#if __GNUC__ || __has_builtin(__builtin_clzll)
    // Fast way to count leading zeros
    return __builtin_clzll(val);
#else
    // Portable fallback: pick the highest nonzero 32-bit word and defer
    // to the 32-bit counter, offsetting by 32 when the high word is empty.
    uint32_t word = (uint32_t)(val >> 32);
    int32_t offset = 0;
    if (word == 0) {
        word = (uint32_t)(val & 0xFFFFFFFF);
        offset = 32;
    }
    return offset + libdivide__count_leading_zeros32(word);
#endif
}
162
// libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit
// uint {v}. The result must fit in 32 bits.
// Returns the quotient directly and the remainder in *r
#if (LIBDIVIDE_IS_i386 || LIBDIVIDE_IS_X86_64) && LIBDIVIDE_GCC_STYLE_ASM
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint32_t result;
    // x86 divl divides EDX:EAX by the operand, leaving the quotient in EAX
    // and the remainder in EDX. If the quotient does not fit in 32 bits the
    // CPU raises a divide error, so callers must guarantee u1 < v.
    __asm__("divl %[v]"
            : "=a"(result), "=d"(*r)
            : [v] "r"(v), "a"(u0), "d"(u1)
            );
    return result;
}
#else
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    // Portable path: assemble the 64-bit numerator and use 64-bit division.
    // int_divide comes from <click/integers.hh>.
    uint64_t n = (((uint64_t)u1) << 32) | u0;
    uint32_t result = (uint32_t)int_divide(n, v);
    *r = (uint32_t)(n - result * (uint64_t)v);
    return result;
}
#endif
183
184 ////////// UINT32
185
// Shared generator for the branching (branchfree == 0) and branchfree
// (branchfree == 1) u32 dividers: computes the magic multiplier and the
// packed "more" field for divisor d.
// NOTE(review): d == 0 is unsupported — floor_log_2_d below is derived from
// count_leading_zeros(0), which is undefined on the builtin path.
static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int branchfree) {
    // 1 is not supported with branchfree algorithm
    LIBDIVIDE_ASSERT(!branchfree || d != 1);

    struct libdivide_u32_t result;
    const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d);
    if ((d & (d - 1)) == 0) {
        // Power of 2
        if (! branchfree) {
            // Shift-only path: magic of 0 flags it, shift holds log2(d).
            result.magic = 0;
            result.more = floor_log_2_d | LIBDIVIDE_U32_SHIFT_PATH;
        } else {
            // We want a magic number of 2**32 and a shift of floor_log_2_d
            // but one of the shifts is taken up by LIBDIVIDE_ADD_MARKER, so we
            // subtract 1 from the shift
            result.magic = 0;
            result.more = (floor_log_2_d-1) | LIBDIVIDE_ADD_MARKER;
        }
    } else {
        uint8_t more;
        uint32_t rem, proposed_m;
        // proposed_m = floor(2^(32 + floor_log_2_d) / d), rem the remainder.
        proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem);

        // d is not a power of 2, so the division cannot be exact.
        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint32_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && (e < (1U << floor_log_2_d))) {
            // This power works
            more = floor_log_2_d;
        } else {
            // We have to use the general 33-bit algorithm. We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder. By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            // twice_rem < rem detects the wraparound of the doubled remainder.
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases.
    }
    return result;
}
237
libdivide_u32_gen(uint32_t d)238 struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
239 return libdivide_internal_u32_gen(d, 0);
240 }
241
libdivide_u32_branchfree_gen(uint32_t d)242 struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
243 struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
244 struct libdivide_u32_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
245 return ret;
246 }
247
libdivide_u32_do(uint32_t numer,const struct libdivide_u32_t * denom)248 uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
249 uint8_t more = denom->more;
250 if (more & LIBDIVIDE_U32_SHIFT_PATH) {
251 return numer >> (more & LIBDIVIDE_32_SHIFT_MASK);
252 }
253 else {
254 uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
255 if (more & LIBDIVIDE_ADD_MARKER) {
256 uint32_t t = ((numer - q) >> 1) + q;
257 return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
258 }
259 else {
260 return q >> more; // all upper bits are 0 - don't need to mask them off
261 }
262 }
263 }
264
// Recover the original divisor from a divider produced by libdivide_u32_gen,
// inverting the math of the generator for each of the three encodings.
uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) {
        // Power-of-2 divisor: the shift encodes log2(d) directly.
        return 1U << shift;
    } else if (! (more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(32 + shift)
        // Therefore we have d = 2^(32 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint32_t hi_dividend = 1U << shift;
        uint32_t rem_ignored;
        return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
        // Also note that shift may be as high as 31, so shift + 1 will
        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
        // then double the quotient and remainder.
        // TODO: do something better than 64 bit math
        uint64_t half_n = 1ULL << (32 + shift);
        uint64_t d = (1ULL << 32) | denom->magic;
        // Note that the quotient is guaranteed <= 32 bits, but the remainder
        // may need 33!
        uint32_t half_q = (uint32_t)int_divide(half_n, d);
        // NOTE(review): this line was marked "broken" by the porter. The
        // uint64 arithmetic itself cannot overflow (half_q * d <= half_n);
        // the concern is presumably 64-bit division support in kernel
        // builds — confirm against click's int_divide.
        uint64_t rem = half_n - half_q * d;
        // We computed 2^(32+shift)/(m+2^32)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
        uint32_t full_q = half_q + half_q + ((rem<<1) >= d);

        // We rounded down in gen unless we're a power of 2 (i.e. in branchfree case)
        // We can detect that by looking at m. If m zero, we're a power of 2
        return full_q + (denom->magic != 0);
    }
}
303
libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t * denom)304 uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
305 struct libdivide_u32_t denom_u32 = {denom->magic, (uint8_t)(denom->more | LIBDIVIDE_ADD_MARKER)};
306 return libdivide_u32_recover(&denom_u32);
307 }
308
libdivide_u32_get_algorithm(const struct libdivide_u32_t * denom)309 int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom) {
310 uint8_t more = denom->more;
311 if (more & LIBDIVIDE_U32_SHIFT_PATH) return 0;
312 else if (! (more & LIBDIVIDE_ADD_MARKER)) return 1;
313 else return 2;
314 }
315
libdivide_u32_do_alg0(uint32_t numer,const struct libdivide_u32_t * denom)316 uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom) {
317 return numer >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
318 }
319
libdivide_u32_do_alg1(uint32_t numer,const struct libdivide_u32_t * denom)320 uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom) {
321 uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
322 return q >> denom->more;
323 }
324
libdivide_u32_do_alg2(uint32_t numer,const struct libdivide_u32_t * denom)325 uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom) {
326 // denom->add != 0
327 uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
328 uint32_t t = ((numer - q) >> 1) + q;
329 // Note that this mask is typically free. Only the low bits are meaningful
330 // to a shift, so compilers can optimize out this AND.
331 return t >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
332 }
333
libdivide_u32_branchfree_do(uint32_t numer,const struct libdivide_u32_branchfree_t * denom)334 uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
335 // same as alg 2
336 uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
337 uint32_t t = ((numer - q) >> 1) + q;
338 return t >> denom->more;
339 }
340
341 CLICK_ENDDECLS
342 #endif
343