1 // libdivide.h
2 // Copyright 2010 - 2016 ridiculous_fish
3 
4 /*
5   libdivide
6   Copyright (C) 2010 ridiculous_fish
7 
8   This software is provided 'as-is', without any express or implied
9   warranty.  In no event will the authors be held liable for any damages
10   arising from the use of this software.
11 
12   Permission is granted to anyone to use this software for any purpose,
13   including commercial applications, and to alter it and redistribute it
14   freely, subject to the following restrictions:
15 
16   1. The origin of this software must not be misrepresented; you must not
17      claim that you wrote the original software. If you use this software
18      in a product, an acknowledgment in the product documentation would be
19      appreciated but is not required.
20   2. Altered source versions must be plainly marked as such, and must not be
21      misrepresented as being the original software.
22   3. This notice may not be removed or altered from any source distribution.
23 
24 libdivide@ridiculousfish.com
25 */
26 /* modified from https://github.com/ridiculousfish/libdivide for click */
27 
28 #ifndef CLICK_LIBDIVIDE_H
29 #define CLICK_LIBDIVIDE_H
30 #include <click/glue.hh>
31 #include <click/integers.hh>
32 CLICK_DECLS
33 
34 #ifndef __has_builtin
35 #define __has_builtin(x) 0 // Compatibility with non-clang compilers.
36 #endif
37 
38 #if defined(__SIZEOF_INT128__)
39 #define HAS_INT128_T 1
40 #endif
41 
42 #if defined(__x86_64__) || defined(_WIN64) || defined(_M_X64)
43 #define LIBDIVIDE_IS_X86_64 1
44 #endif
45 
46 #if defined(__i386__)
47 #define LIBDIVIDE_IS_i386 1
48 #endif
49 
50 #if __GNUC__ || __clang__
51 #define LIBDIVIDE_GCC_STYLE_ASM 1
52 #endif
53 
54 #define LIBDIVIDE_ASSERT(x) assert(x)
55 
56 // Explanation of "more" field: bit 6 is whether to use shift path. If we are
57 // using the shift path, bit 7 is whether the divisor is negative in the signed
58 // case; in the unsigned case it is 0. Bits 0-4 is shift value (for shift
59 // path or mult path).  In 32 bit case, bit 5 is always 0. We use bit 7 as the
60 // "negative divisor indicator" so that we can use sign extension to
61 // efficiently go to a full-width -1.
62 //
63 // u32: [0-4] shift value
64 //      [5] ignored
65 //      [6] add indicator
66 //      [7] shift path
67 //
68 // s32: [0-4] shift value
69 //      [5] shift path
70 //      [6] add indicator
71 //      [7] indicates negative divisor
72 //
73 // u64: [0-5] shift value
74 //      [6] add indicator
75 //      [7] shift path
76 //
77 // s64: [0-5] shift value
78 //      [6] add indicator
79 //      [7] indicates negative divisor
80 //      magic number of 0 indicates shift path (we ran out of bits!)
81 //
82 // In s32 and s64 branchfree modes, the magic number is negated according to
83 // whether the divisor is negated. In branchfree strategy, it is not negated.
84 
// Flag and mask constants for the packed "more" byte (layout documented above).
enum {
    LIBDIVIDE_32_SHIFT_MASK = 0x1F,    // bits 0-4: shift amount for 32-bit dividers
    LIBDIVIDE_64_SHIFT_MASK = 0x3F,    // bits 0-5: shift amount for 64-bit dividers
    LIBDIVIDE_ADD_MARKER = 0x40,       // bit 6: use the "add" (33-bit) path
    LIBDIVIDE_U32_SHIFT_PATH = 0x80,   // bit 7 (u32): divisor is a power of 2, pure shift
    LIBDIVIDE_U64_SHIFT_PATH = 0x80,   // bit 7 (u64): divisor is a power of 2, pure shift
    LIBDIVIDE_S32_SHIFT_PATH = 0x20,   // bit 5 (s32): shift-path marker
    LIBDIVIDE_NEGATIVE_DIVISOR = 0x80  // bit 7 (signed): divisor was negative
};
94 
// Precomputed state for unsigned 32-bit division (branching strategy).
struct libdivide_u32_t {
    uint32_t magic;  // multiplier; 0 selects the power-of-2 shift path
    uint8_t more;    // packed shift amount plus flag bits (see layout comment above)
};
99 
// Precomputed state for unsigned 32-bit division (branchfree strategy).
struct libdivide_u32_branchfree_t {
    uint32_t magic;  // multiplier
    uint8_t more;    // shift amount only; flag bits are stripped by the generator
};
104 
#ifndef LIBDIVIDE_API
// Default linkage: every entry point is a header-only static inline.
#define LIBDIVIDE_API static inline
#endif

// Generates a divider for y. NOTE(review): y == 0 appears unsupported (the
// generator takes clz of y, undefined for 0) -- confirm against callers.
LIBDIVIDE_API struct libdivide_u32_t libdivide_u32_gen(uint32_t y);

// Branchfree variant; y == 1 is rejected by an assertion in the generator.
LIBDIVIDE_API struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t y);

// Computes numer / d, where d is the divisor encoded in *denom.
LIBDIVIDE_API uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom);

// Branchfree division: same result, fixed instruction sequence.
LIBDIVIDE_API uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom);

// Recovers the original divisor from a generated structure.
LIBDIVIDE_API uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);

// Recovers the original divisor from a branchfree structure.
LIBDIVIDE_API uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom);

// Returns 0, 1 or 2: which specialized do_algN routine matches *denom,
// letting callers hoist the algorithm dispatch out of a hot loop.
LIBDIVIDE_API int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom);
LIBDIVIDE_API uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom);
126 
127 //////// Internal Utility Functions
128 
// Returns the high 32 bits of the full 64-bit product x * y.
static inline uint32_t libdivide__mullhi_u32(uint32_t x, uint32_t y) {
    uint64_t product = (uint64_t)x * (uint64_t)y;
    return (uint32_t)(product >> 32);
}
134 
// Counts leading zero bits of val. With the builtin path, val == 0 is
// undefined (as for __builtin_clz); the portable path returns 32 for 0.
static inline int32_t libdivide__count_leading_zeros32(uint32_t val) {
#if __GNUC__ || __has_builtin(__builtin_clz)
    // Compiler intrinsic: typically a single instruction.
    return __builtin_clz(val);
#else
    // Scan from the top bit down until we find a set bit.
    int32_t zeros = 0;
    uint32_t probe = 1U << 31;
    while (probe != 0 && (val & probe) == 0) {
        probe >>= 1;
        zeros++;
    }
    return zeros;
#endif
}
150 
// Counts leading zero bits of the 64-bit value val (undefined for val == 0
// on the builtin path, matching __builtin_clzll).
static inline int32_t libdivide__count_leading_zeros64(uint64_t val) {
#if __GNUC__ || __has_builtin(__builtin_clzll)
    // Compiler intrinsic: typically a single instruction.
    return __builtin_clzll(val);
#else
    // Split into halves and defer to the 32-bit counter.
    uint32_t top = (uint32_t)(val >> 32);
    if (top != 0)
        return libdivide__count_leading_zeros32(top);
    return 32 + libdivide__count_leading_zeros32((uint32_t)(val & 0xFFFFFFFF));
#endif
}
162 
// libdivide_64_div_32_to_32: divides a 64 bit uint {u1, u0} by a 32 bit
// uint {v}. The result must fit in 32 bits.
// Returns the quotient directly and the remainder in *r
#if (LIBDIVIDE_IS_i386 || LIBDIVIDE_IS_X86_64) && LIBDIVIDE_GCC_STYLE_ASM
// x86 path: divl divides edx:eax by the operand, leaving the quotient in
// eax ("=a") and the remainder in edx ("=d").
// NOTE(review): a quotient that does not fit in 32 bits faults on x86, so
// callers must guarantee u1 < v -- confirm against call sites.
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint32_t result;
    __asm__("divl %[v]"
            : "=a"(result), "=d"(*r)
            : [v] "r"(v), "a"(u0), "d"(u1)
            );
    return result;
}
#else
// Portable fallback using 64-bit arithmetic. int_divide comes from
// <click/integers.hh>; presumably it is Click's integer-division helper
// (avoids a raw 64-bit '/' in kernel builds) -- verify its contract there.
static uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
    uint64_t n = (((uint64_t)u1) << 32) | u0;
    uint32_t result = (uint32_t)int_divide(n, v);
    *r = (uint32_t)(n - result * (uint64_t)v);
    return result;
}
#endif
183 
184 ////////// UINT32
185 
// Shared generator behind libdivide_u32_gen and libdivide_u32_branchfree_gen.
// d: the divisor. NOTE(review): d == 0 reaches clz(0), which is undefined on
//     the builtin path -- callers presumably never pass 0.
// branchfree: nonzero to generate for the branchfree strategy.
// Returns the {magic, more} pair encoding the strategy (bit layout is
// documented near the top of this file).
static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int branchfree) {
    // 1 is not supported with branchfree algorithm
    LIBDIVIDE_ASSERT(!branchfree || d != 1);

    struct libdivide_u32_t result;
    // floor(log2(d))
    const uint32_t floor_log_2_d = 31 - libdivide__count_leading_zeros32(d);
    if ((d & (d - 1)) == 0) {
        // Power of 2: a plain shift suffices, no multiply needed.
        if (! branchfree) {
            result.magic = 0;
            result.more = floor_log_2_d | LIBDIVIDE_U32_SHIFT_PATH;
        } else {
            // We want a magic number of 2**32 and a shift of floor_log_2_d
            // but one of the shifts is taken up by LIBDIVIDE_ADD_MARKER, so we
            // subtract 1 from the shift
            result.magic = 0;
            result.more = (floor_log_2_d-1) | LIBDIVIDE_ADD_MARKER;
        }
    } else {
        uint8_t more;
        uint32_t rem, proposed_m;
        // proposed_m = floor(2**(32 + floor_log_2_d) / d), rem = remainder.
        proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem);

        // d is not a power of 2, so the division cannot be exact.
        LIBDIVIDE_ASSERT(rem > 0 && rem < d);
        const uint32_t e = d - rem;

        // This power works if e < 2**floor_log_2_d.
        if (!branchfree && (e < (1U << floor_log_2_d))) {
            // This power works
            more = floor_log_2_d;
        } else {
            // We have to use the general 33-bit algorithm.  We need to compute
            // (2**power) / d. However, we already have (2**(power-1))/d and
            // its remainder.  By doubling both, and then correcting the
            // remainder, we can compute the larger division.
            // don't care about overflow here - in fact, we expect it
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            // twice_rem < rem detects 32-bit wraparound of the doubling.
            if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        result.magic = 1 + proposed_m;
        result.more = more;
        // result.more's shift should in general be ceil_log_2_d. But if we
        // used the smaller power, we subtract one from the shift because we're
        // using the smaller power. If we're using the larger power, we
        // subtract one from the shift because it's taken care of by the add
        // indicator. So floor_log_2_d happens to be correct in both cases.
    }
    return result;
}
237 
libdivide_u32_gen(uint32_t d)238 struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
239     return libdivide_internal_u32_gen(d, 0);
240 }
241 
libdivide_u32_branchfree_gen(uint32_t d)242 struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
243     struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
244     struct libdivide_u32_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
245     return ret;
246 }
247 
libdivide_u32_do(uint32_t numer,const struct libdivide_u32_t * denom)248 uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
249     uint8_t more = denom->more;
250     if (more & LIBDIVIDE_U32_SHIFT_PATH) {
251         return numer >> (more & LIBDIVIDE_32_SHIFT_MASK);
252     }
253     else {
254         uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
255         if (more & LIBDIVIDE_ADD_MARKER) {
256             uint32_t t = ((numer - q) >> 1) + q;
257             return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
258         }
259         else {
260             return q >> more; // all upper bits are 0 - don't need to mask them off
261         }
262     }
263 }
264 
// Reconstructs the original divisor from a generated {magic, more} pair
// (the inverse of libdivide_internal_u32_gen for the branching strategy).
uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    if (more & LIBDIVIDE_U32_SHIFT_PATH) {
        // Shift path: the divisor was exactly 2^shift.
        return 1U << shift;
    } else if (! (more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(32 + shift)
        // Therefore we have d = 2^(32 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint32_t hi_dividend = 1U << shift;
        uint32_t rem_ignored;
        return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
        // Also note that shift may be as high as 31, so shift + 1 will
        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
        // then double the quotient and remainder.
        // TODO: do something better than 64 bit math
        uint64_t half_n = 1ULL << (32 + shift);
        uint64_t d = (1ULL << 32) | denom->magic;
        // Note that the quotient is guaranteed <= 32 bits, but the remainder
        // may need 33!
        uint32_t half_q = (uint32_t)int_divide(half_n, d);
        // half_q promotes to 64 bits in the product, so this cannot truncate.
        uint64_t rem = half_n - half_q * d;
        // We computed 2^(32+shift)/(m+2^32)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
        uint32_t full_q = half_q + half_q + ((rem<<1) >= d);

        // We rounded down in gen unless we're a power of 2 (i.e. in branchfree case)
        // We can detect that by looking at m. If m zero, we're a power of 2
        return full_q + (denom->magic != 0);
    }
}
303 
libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t * denom)304 uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
305     struct libdivide_u32_t denom_u32 = {denom->magic, (uint8_t)(denom->more | LIBDIVIDE_ADD_MARKER)};
306     return libdivide_u32_recover(&denom_u32);
307 }
308 
libdivide_u32_get_algorithm(const struct libdivide_u32_t * denom)309 int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom) {
310     uint8_t more = denom->more;
311     if (more & LIBDIVIDE_U32_SHIFT_PATH) return 0;
312     else if (! (more & LIBDIVIDE_ADD_MARKER)) return 1;
313     else return 2;
314 }
315 
libdivide_u32_do_alg0(uint32_t numer,const struct libdivide_u32_t * denom)316 uint32_t libdivide_u32_do_alg0(uint32_t numer, const struct libdivide_u32_t *denom) {
317     return numer >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
318 }
319 
libdivide_u32_do_alg1(uint32_t numer,const struct libdivide_u32_t * denom)320 uint32_t libdivide_u32_do_alg1(uint32_t numer, const struct libdivide_u32_t *denom) {
321     uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
322     return q >> denom->more;
323 }
324 
libdivide_u32_do_alg2(uint32_t numer,const struct libdivide_u32_t * denom)325 uint32_t libdivide_u32_do_alg2(uint32_t numer, const struct libdivide_u32_t *denom) {
326     // denom->add != 0
327     uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
328     uint32_t t = ((numer - q) >> 1) + q;
329     // Note that this mask is typically free. Only the low bits are meaningful
330     // to a shift, so compilers can optimize out this AND.
331     return t >> (denom->more & LIBDIVIDE_32_SHIFT_MASK);
332 }
333 
libdivide_u32_branchfree_do(uint32_t numer,const struct libdivide_u32_branchfree_t * denom)334 uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
335     // same as alg 2
336     uint32_t q = libdivide__mullhi_u32(denom->magic, numer);
337     uint32_t t = ((numer - q) >> 1) + q;
338     return t >> denom->more;
339 }
340 
341 CLICK_ENDDECLS
342 #endif
343