1 // libdivide.h - Optimized integer division
2 // https://libdivide.com
3 //
4 // Copyright (C) 2010 - 2021 ridiculous_fish, <libdivide@ridiculousfish.com>
5 // Copyright (C) 2016 - 2021 Kim Walisch, <kim.walisch@gmail.com>
6 //
7 // libdivide is dual-licensed under the Boost or zlib licenses.
8 // You may use libdivide under the terms of either of these.
9 // See LICENSE.txt for more details.
10 
11 #ifndef LIBDIVIDE_H
12 #define LIBDIVIDE_H
13 
14 #define LIBDIVIDE_VERSION "5.0"
15 #define LIBDIVIDE_VERSION_MAJOR 5
16 #define LIBDIVIDE_VERSION_MINOR 0
17 
18 #include <stdint.h>
19 #if !defined(__AVR__)
20 #include <stdio.h>
21 #include <stdlib.h>
22 #endif
23 
24 #if defined(LIBDIVIDE_SSE2)
25 #include <emmintrin.h>
26 #endif
27 #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
28 #include <immintrin.h>
29 #endif
30 #if defined(LIBDIVIDE_NEON)
31 #include <arm_neon.h>
32 #endif
33 
34 #if defined(_MSC_VER)
35 #include <intrin.h>
36 #pragma warning(push)
37 // disable warning C4146: unary minus operator applied
38 // to unsigned type, result still unsigned
39 #pragma warning(disable : 4146)
40 // disable warning C4204: nonstandard extension used : non-constant aggregate
41 // initializer
42 //
43 // It's valid C99
44 #pragma warning(disable : 4204)
45 #define LIBDIVIDE_VC
46 #endif
47 
48 #if !defined(__has_builtin)
49 #define __has_builtin(x) 0
50 #endif
51 
52 #if defined(__SIZEOF_INT128__)
53 #define HAS_INT128_T
54 // clang-cl on Windows does not yet support 128-bit division
55 #if !(defined(__clang__) && defined(LIBDIVIDE_VC))
56 #define HAS_INT128_DIV
57 #endif
58 #endif
59 
60 #if defined(__x86_64__) || defined(_M_X64)
61 #define LIBDIVIDE_X86_64
62 #endif
63 
64 #if defined(__i386__)
65 #define LIBDIVIDE_i386
66 #endif
67 
68 #if defined(__GNUC__) || defined(__clang__)
69 #define LIBDIVIDE_GCC_STYLE_ASM
70 #endif
71 
72 #if defined(__cplusplus) || defined(LIBDIVIDE_VC)
73 #define LIBDIVIDE_FUNCTION __FUNCTION__
74 #else
75 #define LIBDIVIDE_FUNCTION __func__
76 #endif
77 
78 // Set up forced inlining if possible.
79 // We need both the attribute and keyword to avoid "might not be inlineable" warnings.
80 #ifdef __has_attribute
81 #if __has_attribute(always_inline)
82 #define LIBDIVIDE_INLINE __attribute__((always_inline)) inline
83 #endif
84 #endif
85 #ifndef LIBDIVIDE_INLINE
86 #define LIBDIVIDE_INLINE inline
87 #endif
88 
89 #if defined(__AVR__)
90 #define LIBDIVIDE_ERROR(msg)
91 #else
92 #define LIBDIVIDE_ERROR(msg)                                                                     \
93     do {                                                                                         \
94         fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", __LINE__, LIBDIVIDE_FUNCTION, msg); \
95         abort();                                                                                 \
96     } while (0)
97 #endif
98 
99 #if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__)
100 #define LIBDIVIDE_ASSERT(x)                                                           \
101     do {                                                                              \
102         if (!(x)) {                                                                   \
103             fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", __LINE__, \
104                 LIBDIVIDE_FUNCTION, #x);                                              \
105             abort();                                                                  \
106         }                                                                             \
107     } while (0)
108 #else
109 #define LIBDIVIDE_ASSERT(x)
110 #endif
111 
112 #ifdef __cplusplus
113 namespace libdivide {
114 #endif
115 
116 // pack divider structs to prevent compilers from padding.
117 // This reduces memory usage by up to 43% when using a large
118 // array of libdivide dividers and improves performance
119 // by up to 10% because of reduced memory bandwidth.
120 #pragma pack(push, 1)
121 
122 struct libdivide_u16_t {
123     uint16_t magic;
124     uint8_t more;
125 };
126 
127 struct libdivide_s16_t {
128     int16_t magic;
129     uint8_t more;
130 };
131 
132 struct libdivide_u32_t {
133     uint32_t magic;
134     uint8_t more;
135 };
136 
137 struct libdivide_s32_t {
138     int32_t magic;
139     uint8_t more;
140 };
141 
142 struct libdivide_u64_t {
143     uint64_t magic;
144     uint8_t more;
145 };
146 
147 struct libdivide_s64_t {
148     int64_t magic;
149     uint8_t more;
150 };
151 
152 struct libdivide_u16_branchfree_t {
153     uint16_t magic;
154     uint8_t more;
155 };
156 
157 struct libdivide_s16_branchfree_t {
158     int16_t magic;
159     uint8_t more;
160 };
161 
162 struct libdivide_u32_branchfree_t {
163     uint32_t magic;
164     uint8_t more;
165 };
166 
167 struct libdivide_s32_branchfree_t {
168     int32_t magic;
169     uint8_t more;
170 };
171 
172 struct libdivide_u64_branchfree_t {
173     uint64_t magic;
174     uint8_t more;
175 };
176 
177 struct libdivide_s64_branchfree_t {
178     int64_t magic;
179     uint8_t more;
180 };
181 
182 #pragma pack(pop)
183 
184 // Explanation of the "more" field:
185 //
186 // * Bits 0-5 are the shift value (for shift path or mult path).
187 // * Bit 6 is the add indicator for mult path.
188 // * Bit 7 is set if the divisor is negative. We use bit 7 as the negative
189 //   divisor indicator so that we can efficiently use sign extension to
190 //   create a bitmask with all bits set to 1 (if the divisor is negative)
191 //   or 0 (if the divisor is positive).
192 //
193 // u32: [0-4] shift value
194 //      [5] ignored
195 //      [6] add indicator
196 //      magic number of 0 indicates shift path
197 //
198 // s32: [0-4] shift value
199 //      [5] ignored
200 //      [6] add indicator
201 //      [7] indicates negative divisor
202 //      magic number of 0 indicates shift path
203 //
204 // u64: [0-5] shift value
205 //      [6] add indicator
206 //      magic number of 0 indicates shift path
207 //
208 // s64: [0-5] shift value
209 //      [6] add indicator
210 //      [7] indicates negative divisor
211 //      magic number of 0 indicates shift path
212 //
213 // In s32 and s64 branchfull (default) mode, the magic number is negated according
214 // to whether the divisor is negated. In branchfree strategy, it is not negated.
215 
216 enum {
217     LIBDIVIDE_16_SHIFT_MASK = 0x1F,
218     LIBDIVIDE_32_SHIFT_MASK = 0x1F,
219     LIBDIVIDE_64_SHIFT_MASK = 0x3F,
220     LIBDIVIDE_ADD_MARKER = 0x40,
221     LIBDIVIDE_NEGATIVE_DIVISOR = 0x80
222 };
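// Illustrative sketch (not part of the library itself): decoding the "more" field
// of a 32-bit divider using the masks above.
//
//     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;            // bits 0-4
//     int use_add   = (more & LIBDIVIDE_ADD_MARKER) != 0;        // bit 6
//     int negative  = (more & LIBDIVIDE_NEGATIVE_DIVISOR) != 0;  // bit 7, signed types only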
223 
224 static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d);
225 static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d);
226 static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d);
227 static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d);
228 static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d);
229 static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d);
230 
231 static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d);
232 static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d);
233 static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d);
234 static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d);
235 static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
236 static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);
237 
238 static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
239     int16_t numer, int16_t magic, uint8_t more);
240 static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
241     int16_t numer, const struct libdivide_s16_t* denom);
242 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
243     uint16_t numer, uint16_t magic, uint8_t more);
244 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
245     uint16_t numer, const struct libdivide_u16_t* denom);
246 static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
247     int32_t numer, const struct libdivide_s32_t *denom);
248 static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
249     uint32_t numer, const struct libdivide_u32_t *denom);
250 static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
251     int64_t numer, const struct libdivide_s64_t *denom);
252 static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
253     uint64_t numer, const struct libdivide_u64_t *denom);
254 
255 static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
256     int16_t numer, const struct libdivide_s16_branchfree_t* denom);
257 static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
258     uint16_t numer, const struct libdivide_u16_branchfree_t* denom);
259 static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
260     int32_t numer, const struct libdivide_s32_branchfree_t *denom);
261 static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
262     uint32_t numer, const struct libdivide_u32_branchfree_t *denom);
263 static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
264     int64_t numer, const struct libdivide_s64_branchfree_t *denom);
265 static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
266     uint64_t numer, const struct libdivide_u64_branchfree_t *denom);
267 
268 static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom);
269 static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom);
270 static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
271 static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
272 static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
273 static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);
274 
275 static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
276     const struct libdivide_s16_branchfree_t* denom);
277 static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
278     const struct libdivide_u16_branchfree_t* denom);
279 static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
280     const struct libdivide_s32_branchfree_t *denom);
281 static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
282     const struct libdivide_u32_branchfree_t *denom);
283 static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(
284     const struct libdivide_s64_branchfree_t *denom);
285 static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(
286     const struct libdivide_u64_branchfree_t *denom);
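// Illustrative usage sketch (not part of the header itself): a divider is generated
// once from a runtime divisor and then reused for many divisions. The variable and
// array names below (d, count, in, out) are placeholders.
//
//     struct libdivide_u32_t fast_d = libdivide_u32_gen(d);    // d != 0
//     for (uint32_t i = 0; i < count; i++) {
//         out[i] = libdivide_u32_do(in[i], &fast_d);           // computes in[i] / d
//     }
//     // libdivide_u32_recover(&fast_d) returns the original d.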
287 
288 //////// Internal Utility Functions
289 
290 static LIBDIVIDE_INLINE uint16_t libdivide_mullhi_u16(uint16_t x, uint16_t y) {
291     uint32_t xl = x, yl = y;
292     uint32_t rl = xl * yl;
293     return (uint16_t)(rl >> 16);
294 }
295 
296 static LIBDIVIDE_INLINE int16_t libdivide_mullhi_s16(int16_t x, int16_t y) {
297     int32_t xl = x, yl = y;
298     int32_t rl = xl * yl;
299     // needs to be arithmetic shift
300     return (int16_t)(rl >> 16);
301 }
302 
303 static LIBDIVIDE_INLINE uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) {
304     uint64_t xl = x, yl = y;
305     uint64_t rl = xl * yl;
306     return (uint32_t)(rl >> 32);
307 }
308 
309 static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
310     int64_t xl = x, yl = y;
311     int64_t rl = xl * yl;
312     // needs to be arithmetic shift
313     return (int32_t)(rl >> 32);
314 }
315 
316 static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
317 #if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64)
318     return __umulh(x, y);
319 #elif defined(HAS_INT128_T)
320     __uint128_t xl = x, yl = y;
321     __uint128_t rl = xl * yl;
322     return (uint64_t)(rl >> 64);
323 #else
324     // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
325     uint32_t mask = 0xFFFFFFFF;
326     uint32_t x0 = (uint32_t)(x & mask);
327     uint32_t x1 = (uint32_t)(x >> 32);
328     uint32_t y0 = (uint32_t)(y & mask);
329     uint32_t y1 = (uint32_t)(y >> 32);
330     uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
331     uint64_t x0y1 = x0 * (uint64_t)y1;
332     uint64_t x1y0 = x1 * (uint64_t)y0;
333     uint64_t x1y1 = x1 * (uint64_t)y1;
334     uint64_t temp = x1y0 + x0y0_hi;
335     uint64_t temp_lo = temp & mask;
336     uint64_t temp_hi = temp >> 32;
337 
338     return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32);
339 #endif
340 }
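// Illustrative sanity checks for the high-half multiply (easy to verify by hand):
//     libdivide_mullhi_u64(1ull << 32, 1ull << 32) == 1   // 2^32 * 2^32 = 2^64
//     libdivide_mullhi_u64(UINT64_MAX, 2)          == 1   // 2^65 - 2, high 64 bits are 1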
341 
342 static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
343 #if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64)
344     return __mulh(x, y);
345 #elif defined(HAS_INT128_T)
346     __int128_t xl = x, yl = y;
347     __int128_t rl = xl * yl;
348     return (int64_t)(rl >> 64);
349 #else
350     // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64)
351     uint32_t mask = 0xFFFFFFFF;
352     uint32_t x0 = (uint32_t)(x & mask);
353     uint32_t y0 = (uint32_t)(y & mask);
354     int32_t x1 = (int32_t)(x >> 32);
355     int32_t y1 = (int32_t)(y >> 32);
356     uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0);
357     int64_t t = x1 * (int64_t)y0 + x0y0_hi;
358     int64_t w1 = x0 * (int64_t)y1 + (t & mask);
359 
360     return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32);
361 #endif
362 }
363 
364 static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
365 #if defined(__AVR__)
366     // Fast way to count leading zeros
367     // On the AVR 8-bit architecture __builtin_clz() works on an int16_t.
368     return __builtin_clz(val);
369 #elif defined(__GNUC__) || __has_builtin(__builtin_clz)
370     // Fast way to count leading zeros
371     return __builtin_clz(val) - 16;
372 #elif defined(LIBDIVIDE_VC)
373     unsigned long result;
374     if (_BitScanReverse(&result, (unsigned long)val)) {
375         return (int16_t)(15 - result);
376     }
377     return 0;
378 #else
379     if (val == 0) return 16;
380     int16_t result = 4;
381     uint16_t hi = 0xFU << 12;
382     while ((val & hi) == 0) {
383         hi >>= 4;
384         result += 4;
385     }
386     while (val & hi) {
387         result -= 1;
388         hi <<= 1;
389     }
390     return result;
391 #endif
392 }
393 
394 static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
395 #if defined(__AVR__)
396     // Fast way to count leading zeros
397     return __builtin_clzl(val);
398 #elif defined(__GNUC__) || __has_builtin(__builtin_clz)
399     // Fast way to count leading zeros
400     return __builtin_clz(val);
401 #elif defined(LIBDIVIDE_VC)
402     unsigned long result;
403     if (_BitScanReverse(&result, val)) {
404         return 31 - result;
405     }
406     return 0;
407 #else
408     if (val == 0) return 32;
409     int32_t result = 8;
410     uint32_t hi = 0xFFU << 24;
411     while ((val & hi) == 0) {
412         hi >>= 8;
413         result += 8;
414     }
415     while (val & hi) {
416         result -= 1;
417         hi <<= 1;
418     }
419     return result;
420 #endif
421 }
422 
423 static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
424 #if defined(__GNUC__) || __has_builtin(__builtin_clzll)
425     // Fast way to count leading zeros
426     return __builtin_clzll(val);
427 #elif defined(LIBDIVIDE_VC) && defined(_WIN64)
428     unsigned long result;
429     if (_BitScanReverse64(&result, val)) {
430         return 63 - result;
431     }
432     return 0;
433 #else
434     uint32_t hi = val >> 32;
435     uint32_t lo = val & 0xFFFFFFFF;
436     if (hi != 0) return libdivide_count_leading_zeros32(hi);
437     return 32 + libdivide_count_leading_zeros32(lo);
438 #endif
439 }
440 
441 // libdivide_32_div_16_to_16: divides a 32-bit uint {u1, u0} by a 16-bit
442 // uint {v}. The result must fit in 16 bits.
443 // Returns the quotient directly and the remainder in *r
444 static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
445     uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) {
446     uint32_t n = ((uint32_t)u1 << 16) | u0;
447     uint16_t result = (uint16_t)(n / v);
448     *r = (uint16_t)(n - result * (uint32_t)v);
449     return result;
450 }
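// Illustrative example: {u1, u0} = {1, 2} encodes the 32-bit value 0x00010002 = 65538,
// so libdivide_32_div_16_to_16(1, 2, 5, &r) returns 13107 with r == 3
// (13107 * 5 + 3 == 65538), and the quotient fits in 16 bits as required.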
451 
452 // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit
453 // uint {v}. The result must fit in 32 bits.
454 // Returns the quotient directly and the remainder in *r
455 static LIBDIVIDE_INLINE uint32_t libdivide_64_div_32_to_32(
456     uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) {
457 #if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && defined(LIBDIVIDE_GCC_STYLE_ASM)
458     uint32_t result;
459     __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1));
460     return result;
461 #else
462     uint64_t n = ((uint64_t)u1 << 32) | u0;
463     uint32_t result = (uint32_t)(n / v);
464     *r = (uint32_t)(n - result * (uint64_t)v);
465     return result;
466 #endif
467 }
468 
469 // libdivide_128_div_64_to_64: divides a 128-bit uint {numhi, numlo} by a 64-bit uint {den}. The
470 // result must fit in 64 bits. Returns the quotient directly and the remainder in *r
471 static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
472     uint64_t numhi, uint64_t numlo, uint64_t den, uint64_t *r) {
473     // N.B. resist the temptation to use __uint128_t here.
474     // In LLVM compiler-rt, it performs a 128/128 -> 128 division which is many times slower than
475     // necessary. In gcc it's better but still slower than the divlu implementation, perhaps because
476     // it's not LIBDIVIDE_INLINEd.
477 #if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM)
478     uint64_t result;
479     __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
480     return result;
481 #else
482     // We work in base 2**32.
483     // A uint32 holds a single digit. A uint64 holds two digits.
484     // Our numerator is conceptually [num3, num2, num1, num0].
485     // Our denominator is [den1, den0].
486     const uint64_t b = ((uint64_t)1 << 32);
487 
488     // The high and low digits of our computed quotient.
489     uint32_t q1;
490     uint32_t q0;
491 
492     // The normalization shift factor.
493     int shift;
494 
495     // The high and low digits of our denominator (after normalizing).
496     // Also the low 2 digits of our numerator (after normalizing).
497     uint32_t den1;
498     uint32_t den0;
499     uint32_t num1;
500     uint32_t num0;
501 
502     // A partial remainder.
503     uint64_t rem;
504 
505     // The estimated quotient, and its corresponding remainder (unrelated to true remainder).
506     uint64_t qhat;
507     uint64_t rhat;
508 
509     // Variables used to correct the estimated quotient.
510     uint64_t c1;
511     uint64_t c2;
512 
513     // Check for overflow and divide by 0.
514     if (numhi >= den) {
515         if (r != NULL) *r = ~0ull;
516         return ~0ull;
517     }
518 
519     // Determine the normalization factor. We multiply den by this, so that its leading digit is at
520     // least half b. In binary this means just shifting left by the number of leading zeros, so that
521     // there's a 1 in the MSB.
522     // We also shift numer by the same amount. This cannot overflow because numhi < den.
523     // The expression (-shift & 63) is the same as (64 - shift), except it avoids the UB of shifting
524     // by 64. The funny bitwise 'and' ensures that numlo does not get shifted into numhi if shift is
525     // 0. clang 11 has an x86 codegen bug here: see LLVM bug 50118. The sequence below avoids it.
526     shift = libdivide_count_leading_zeros64(den);
527     den <<= shift;
528     numhi <<= shift;
529     numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63);
530     numlo <<= shift;
531 
532     // Extract the low digits of the numerator and both digits of the denominator.
533     num1 = (uint32_t)(numlo >> 32);
534     num0 = (uint32_t)(numlo & 0xFFFFFFFFu);
535     den1 = (uint32_t)(den >> 32);
536     den0 = (uint32_t)(den & 0xFFFFFFFFu);
537 
538     // We wish to compute q1 = [n3 n2 n1] / [d1 d0].
539     // Estimate q1 as [n3 n2] / [d1], and then correct it.
540     // Note while qhat may be 2 digits, q1 is always 1 digit.
541     qhat = numhi / den1;
542     rhat = numhi % den1;
543     c1 = qhat * den0;
544     c2 = rhat * b + num1;
545     if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
546     q1 = (uint32_t)qhat;
547 
548     // Compute the true (partial) remainder.
549     rem = numhi * b + num1 - q1 * den;
550 
551     // We wish to compute q0 = [rem1 rem0 n0] / [d1 d0].
552     // Estimate q0 as [rem1 rem0] / [d1] and correct it.
553     qhat = rem / den1;
554     rhat = rem % den1;
555     c1 = qhat * den0;
556     c2 = rhat * b + num0;
557     if (c1 > c2) qhat -= (c1 - c2 > den) ? 2 : 1;
558     q0 = (uint32_t)qhat;
559 
560     // Return remainder if requested.
561     if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift;
562     return ((uint64_t)q1 << 32) | q0;
563 #endif
564 }
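// Illustrative example: dividing 2^64 (numhi = 1, numlo = 0) by 3 yields
// 0x5555555555555555 with remainder 1, both of which fit in 64 bits as required.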
565 
566 // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
567 static LIBDIVIDE_INLINE void libdivide_u128_shift(
568     uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
569     if (signed_shift > 0) {
570         uint32_t shift = signed_shift;
571         *u1 <<= shift;
572         *u1 |= *u0 >> (64 - shift);
573         *u0 <<= shift;
574     } else if (signed_shift < 0) {
575         uint32_t shift = -signed_shift;
576         *u0 >>= shift;
577         *u0 |= *u1 << (64 - shift);
578         *u1 >>= shift;
579     }
580 }
581 
582 // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
583 static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
584     uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
585 #if defined(HAS_INT128_T) && defined(HAS_INT128_DIV)
586     __uint128_t ufull = u_hi;
587     __uint128_t vfull = v_hi;
588     ufull = (ufull << 64) | u_lo;
589     vfull = (vfull << 64) | v_lo;
590     uint64_t res = (uint64_t)(ufull / vfull);
591     __uint128_t remainder = ufull - (vfull * res);
592     *r_lo = (uint64_t)remainder;
593     *r_hi = (uint64_t)(remainder >> 64);
594     return res;
595 #else
596     // Adapted from "Unsigned Doubleword Division" in Hacker's Delight
597     // We want to compute u / v
598     typedef struct {
599         uint64_t hi;
600         uint64_t lo;
601     } u128_t;
602     u128_t u = {u_hi, u_lo};
603     u128_t v = {v_hi, v_lo};
604 
605     if (v.hi == 0) {
606         // divisor v is a 64 bit value, so we just need one 128/64 division
607         // Note that we are simpler than Hacker's Delight here, because we know
608         // the quotient fits in 64 bits whereas Hacker's Delight demands a full
609         // 128 bit quotient
610         *r_hi = 0;
611         return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo);
612     }
613     // Here v >= 2**64
614     // We know that v.hi != 0, so count leading zeros is OK
615     // We have 0 <= n <= 63
616     uint32_t n = libdivide_count_leading_zeros64(v.hi);
617 
618     // Normalize the divisor so its MSB is 1
619     u128_t v1t = v;
620     libdivide_u128_shift(&v1t.hi, &v1t.lo, n);
621     uint64_t v1 = v1t.hi;  // i.e. v1 = v1t >> 64
622 
623     // To ensure no overflow
624     u128_t u1 = u;
625     libdivide_u128_shift(&u1.hi, &u1.lo, -1);
626 
627     // Get quotient from divide unsigned insn.
628     uint64_t rem_ignored;
629     uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored);
630 
631     // Undo normalization and division of u by 2.
632     u128_t q0 = {0, q1};
633     libdivide_u128_shift(&q0.hi, &q0.lo, n);
634     libdivide_u128_shift(&q0.hi, &q0.lo, -63);
635 
636     // Make q0 correct or too small by 1
637     // Equivalent to `if (q0 != 0) q0 = q0 - 1;`
638     if (q0.hi != 0 || q0.lo != 0) {
639         q0.hi -= (q0.lo == 0);  // borrow
640         q0.lo -= 1;
641     }
642 
643     // Now q0 is correct.
644     // Compute q0 * v as q0v
645     // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo)
646     // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) +
647     //   (q0.lo * v.hi <<  64) + q0.lo * v.lo)
648     // Each term is 128 bit
649     // High half of full product (upper 128 bits!) are dropped
650     u128_t q0v = {0, 0};
651     q0v.hi = q0.hi * v.lo + q0.lo * v.hi + libdivide_mullhi_u64(q0.lo, v.lo);
652     q0v.lo = q0.lo * v.lo;
653 
654     // Compute u - q0v as u_q0v
655     // This is the remainder
656     u128_t u_q0v = u;
657     u_q0v.hi -= q0v.hi + (u.lo < q0v.lo);  // second term is borrow
658     u_q0v.lo -= q0v.lo;
659 
660     // Check if u_q0v >= v
661     // This checks if our remainder is larger than the divisor
662     if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) {
663         // Increment q0
664         q0.lo += 1;
665         q0.hi += (q0.lo == 0);  // carry
666 
667         // Subtract v from remainder
668         u_q0v.hi -= v.hi + (u_q0v.lo < v.lo);
669         u_q0v.lo -= v.lo;
670     }
671 
672     *r_hi = u_q0v.hi;
673     *r_lo = u_q0v.lo;
674 
675     LIBDIVIDE_ASSERT(q0.hi == 0);
676     return q0.lo;
677 #endif
678 }
679 
680 ////////// UINT16
681 
682 static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
683     uint16_t d, int branchfree) {
684     if (d == 0) {
685         LIBDIVIDE_ERROR("divider must be != 0");
686     }
687 
688     struct libdivide_u16_t result;
689     uint8_t floor_log_2_d = (uint8_t)(15 - libdivide_count_leading_zeros16(d));
690 
691     // Power of 2
692     if ((d & (d - 1)) == 0) {
693         // We need to subtract 1 from the shift value in case of an unsigned
694         // branchfree divider because there is a hardcoded right shift by 1
695         // in its division algorithm. Because of this we also need to add back
696         // 1 in its recovery algorithm.
697         result.magic = 0;
698         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
699     }
700     else {
701         uint8_t more;
702         uint16_t rem, proposed_m;
703         proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);
704 
705         LIBDIVIDE_ASSERT(rem > 0 && rem < d);
706         const uint16_t e = d - rem;
707 
708         // This power works if e < 2**floor_log_2_d.
709         if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
710             // This power works
711             more = floor_log_2_d;
712         }
713         else {
714             // We have to use the general 17-bit algorithm.  We need to compute
715             // (2**power) / d. However, we already have (2**(power-1))/d and
716             // its remainder.  By doubling both, and then correcting the
717             // remainder, we can compute the larger division.
718             // don't care about overflow here - in fact, we expect it
719             proposed_m += proposed_m;
720             const uint16_t twice_rem = rem + rem;
721             if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
722             more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
723         }
724         result.magic = 1 + proposed_m;
725         result.more = more;
726         // result.more's shift should in general be ceil_log_2_d. But if we
727         // used the smaller power, we subtract one from the shift because we're
728         // using the smaller power. If we're using the larger power, we
729         // subtract one from the shift because it's taken care of by the add
730         // indicator. So floor_log_2_d happens to be correct in both cases.
731     }
732     return result;
733 }
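// Worked example (illustrative) for d = 7: floor_log_2_d = 2 and the 32/16 division
// gives proposed_m = 37449 with rem = 1. Since e = 7 - 1 = 6 >= 2^2, the general
// 17-bit path is taken: proposed_m doubles (mod 2^16) to 9362, twice_rem = 2 < 7,
// so the result is magic = 9363 and more = 2 | LIBDIVIDE_ADD_MARKER.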
734 
735 struct libdivide_u16_t libdivide_u16_gen(uint16_t d) {
736     return libdivide_internal_u16_gen(d, 0);
737 }
738 
739 struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
740     if (d == 1) {
741         LIBDIVIDE_ERROR("branchfree divider must be != 1");
742     }
743     struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
744     struct libdivide_u16_branchfree_t ret = {
745         tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) };
746     return ret;
747 }
748 
749 // The original libdivide_u16_do takes a const pointer. However, this cannot be used
750 // with a compile time constant libdivide_u16_t: it will generate a warning about
751 // taking the address of a temporary. Hence this overload.
752 uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
753     if (!magic) {
754         return numer >> more;
755     }
756     else {
757         uint16_t q = libdivide_mullhi_u16(magic, numer);
758         if (more & LIBDIVIDE_ADD_MARKER) {
759             uint16_t t = ((numer - q) >> 1) + q;
760             return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
761         }
762         else {
763             // All upper bits are 0,
764             // don't need to mask them off.
765             return q >> more;
766         }
767     }
768 }
769 
770 uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
771     return libdivide_u16_do_raw(numer, denom->magic, denom->more);
772 }
773 
774 uint16_t libdivide_u16_branchfree_do(
775     uint16_t numer, const struct libdivide_u16_branchfree_t* denom) {
776     uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
777     uint16_t t = ((numer - q) >> 1) + q;
778     return t >> denom->more;
779 }
780 
781 uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
782     uint8_t more = denom->more;
783     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
784 
785     if (!denom->magic) {
786         return (uint16_t)1 << shift;
787     } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
788         // We compute q = n/d = n*m / 2^(16 + shift)
789         // Therefore we have d = 2^(16 + shift) / m
790         // We need to ceil it.
791         // We know d is not a power of 2, so m is not a power of 2,
792         // so we can just add 1 to the floor
793         uint16_t hi_dividend = (uint16_t)1 << shift;
794         uint16_t rem_ignored;
795         return 1 + libdivide_32_div_16_to_16(hi_dividend, 0, denom->magic, &rem_ignored);
796     } else {
797         // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
798         // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
799         // Also note that shift may be as high as 15, so shift + 1 will
800         // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
801         // then double the quotient and remainder.
802         uint32_t half_n = (uint32_t)1 << (16 + shift);
803         uint32_t d = ((uint32_t)1 << 16) | denom->magic;
804         // Note that the quotient is guaranteed <= 16 bits, but the remainder
805         // may need 17!
806         uint16_t half_q = (uint16_t)(half_n / d);
807         uint32_t rem = half_n % d;
808         // We computed 2^(16+shift)/(m+2^16)
809         // Need to double it, and then add 1 to the quotient if doubling the
810         // remainder would increase the quotient.
811         // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
812         uint16_t full_q = half_q + half_q + ((rem << 1) >= d);
813 
814         // We rounded down in gen (hence +1)
815         return full_q + 1;
816     }
817 }
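// Worked example (illustrative), continuing d = 7 (magic = 9363, shift = 2, add
// indicator set): half_n = 2^18 = 262144 and d = 2^16 + 9363 = 74899, so
// half_q = 3 with rem = 37447. Doubling the remainder gives 74894 < 74899, so
// full_q = 6 and full_q + 1 recovers the original divisor 7.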
818 
819 uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) {
820     uint8_t more = denom->more;
821     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
822 
823     if (!denom->magic) {
824         return (uint16_t)1 << (shift + 1);
825     } else {
826         // Here we wish to compute d = 2^(16+shift+1)/(m+2^16).
827         // Notice (m + 2^16) is a 17 bit number. Use 32 bit division for now
828         // Also note that shift may be as high as 15, so shift + 1 will
829         // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
830         // then double the quotient and remainder.
831         uint32_t half_n = (uint32_t)1 << (16 + shift);
832         uint32_t d = ((uint32_t)1 << 16) | denom->magic;
833         // Note that the quotient is guaranteed <= 16 bits, but the remainder
834         // may need 17!
835         uint16_t half_q = (uint16_t)(half_n / d);
836         uint32_t rem = half_n % d;
837         // We computed 2^(16+shift)/(m+2^16)
838         // Need to double it, and then add 1 to the quotient if doubling the
839         // remainder would increase the quotient.
840         // Note that rem<<1 cannot overflow, since rem < d and d is 17 bits
841         uint16_t full_q = half_q + half_q + ((rem << 1) >= d);
842 
843         // We rounded down in gen (hence +1)
844         return full_q + 1;
845     }
846 }
847 
848 ////////// UINT32
849 
850 static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen(
851     uint32_t d, int branchfree) {
852     if (d == 0) {
853         LIBDIVIDE_ERROR("divider must be != 0");
854     }
855 
856     struct libdivide_u32_t result;
857     uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d);
858 
859     // Power of 2
860     if ((d & (d - 1)) == 0) {
861         // We need to subtract 1 from the shift value in case of an unsigned
862         // branchfree divider because there is a hardcoded right shift by 1
863         // in its division algorithm. Because of this we also need to add back
864         // 1 in its recovery algorithm.
865         result.magic = 0;
866         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
867     } else {
868         uint8_t more;
869         uint32_t rem, proposed_m;
870         proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << floor_log_2_d, 0, d, &rem);
871 
872         LIBDIVIDE_ASSERT(rem > 0 && rem < d);
873         const uint32_t e = d - rem;
874 
875         // This power works if e < 2**floor_log_2_d.
876         if (!branchfree && (e < ((uint32_t)1 << floor_log_2_d))) {
877             // This power works
878             more = (uint8_t)floor_log_2_d;
879         } else {
880             // We have to use the general 33-bit algorithm.  We need to compute
881             // (2**power) / d. However, we already have (2**(power-1))/d and
882             // its remainder.  By doubling both, and then correcting the
883             // remainder, we can compute the larger division.
884             // don't care about overflow here - in fact, we expect it
885             proposed_m += proposed_m;
886             const uint32_t twice_rem = rem + rem;
887             if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
888             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
889         }
890         result.magic = 1 + proposed_m;
891         result.more = more;
892         // result.more's shift should in general be ceil_log_2_d. But if we
893         // used the smaller power, we subtract one from the shift because we're
894         // using the smaller power. If we're using the larger power, we
895         // subtract one from the shift because it's taken care of by the add
896         // indicator. So floor_log_2_d happens to be correct in both cases.
897     }
898     return result;
899 }
900 
901 struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
902     return libdivide_internal_u32_gen(d, 0);
903 }
904 
905 struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
906     if (d == 1) {
907         LIBDIVIDE_ERROR("branchfree divider must be != 1");
908     }
909     struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
910     struct libdivide_u32_branchfree_t ret = {
911         tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
912     return ret;
913 }
914 
915 uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
916     uint8_t more = denom->more;
917     if (!denom->magic) {
918         return numer >> more;
919     } else {
920         uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
921         if (more & LIBDIVIDE_ADD_MARKER) {
922             uint32_t t = ((numer - q) >> 1) + q;
923             return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
924         } else {
925             // All upper bits are 0,
926             // don't need to mask them off.
927             return q >> more;
928         }
929     }
930 }
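// Worked example (illustrative): libdivide_u32_gen(7) yields magic = 0x24924925 and
// more = 2 | LIBDIVIDE_ADD_MARKER. For numer = 100: q = mullhi(magic, 100) = 14,
// t = ((100 - 14) >> 1) + 14 = 57, and 57 >> 2 = 14 == 100 / 7.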
931 
932 uint32_t libdivide_u32_branchfree_do(
933     uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
934     uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
935     uint32_t t = ((numer - q) >> 1) + q;
936     return t >> denom->more;
937 }
938 
939 uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
940     uint8_t more = denom->more;
941     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
942 
943     if (!denom->magic) {
944         return (uint32_t)1 << shift;
945     } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
946         // We compute q = n/d = n*m / 2^(32 + shift)
947         // Therefore we have d = 2^(32 + shift) / m
948         // We need to ceil it.
949         // We know d is not a power of 2, so m is not a power of 2,
950         // so we can just add 1 to the floor
951         uint32_t hi_dividend = (uint32_t)1 << shift;
952         uint32_t rem_ignored;
953         return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
954     } else {
955         // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
956         // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
957         // Also note that shift may be as high as 31, so shift + 1 will
958         // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
959         // then double the quotient and remainder.
960         uint64_t half_n = (uint64_t)1 << (32 + shift);
961         uint64_t d = ((uint64_t)1 << 32) | denom->magic;
962         // Note that the quotient is guaranteed <= 32 bits, but the remainder
963         // may need 33!
964         uint32_t half_q = (uint32_t)(half_n / d);
965         uint64_t rem = half_n % d;
966         // We computed 2^(32+shift)/(m+2^32)
967         // Need to double it, and then add 1 to the quotient if doubling the
968         // remainder would increase the quotient.
969         // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
970         uint32_t full_q = half_q + half_q + ((rem << 1) >= d);
971 
972         // We rounded down in gen (hence +1)
973         return full_q + 1;
974     }
975 }
976 
977 uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
978     uint8_t more = denom->more;
979     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
980 
981     if (!denom->magic) {
982         return (uint32_t)1 << (shift + 1);
983     } else {
984         // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
985         // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
986         // Also note that shift may be as high as 31, so shift + 1 will
987         // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
988         // then double the quotient and remainder.
989         uint64_t half_n = (uint64_t)1 << (32 + shift);
990         uint64_t d = ((uint64_t)1 << 32) | denom->magic;
991         // Note that the quotient is guaranteed <= 32 bits, but the remainder
992         // may need 33!
993         uint32_t half_q = (uint32_t)(half_n / d);
994         uint64_t rem = half_n % d;
995         // We computed 2^(32+shift)/(m+2^32)
996         // Need to double it, and then add 1 to the quotient if doubling the
997         // remainder would increase the quotient.
998         // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
999         uint32_t full_q = half_q + half_q + ((rem << 1) >= d);
1000 
1001         // We rounded down in gen (hence +1)
1002         return full_q + 1;
1003     }
1004 }
1005 
1006 /////////// UINT64
1007 
1008 static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen(
1009     uint64_t d, int branchfree) {
1010     if (d == 0) {
1011         LIBDIVIDE_ERROR("divider must be != 0");
1012     }
1013 
1014     struct libdivide_u64_t result;
1015     uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d);
1016 
1017     // Power of 2
1018     if ((d & (d - 1)) == 0) {
1019         // We need to subtract 1 from the shift value in case of an unsigned
1020         // branchfree divider because there is a hardcoded right shift by 1
1021         // in its division algorithm. Because of this we also need to add back
1022         // 1 in its recovery algorithm.
1023         result.magic = 0;
1024         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
1025     } else {
1026         uint64_t proposed_m, rem;
1027         uint8_t more;
1028         // (1 << (64 + floor_log_2_d)) / d
1029         proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << floor_log_2_d, 0, d, &rem);
1030 
1031         LIBDIVIDE_ASSERT(rem > 0 && rem < d);
1032         const uint64_t e = d - rem;
1033 
1034         // This power works if e < 2**floor_log_2_d.
1035         if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
1036             // This power works
1037             more = (uint8_t)floor_log_2_d;
1038         } else {
1039             // We have to use the general 65-bit algorithm.  We need to compute
1040             // (2**power) / d. However, we already have (2**(power-1))/d and
1041             // its remainder. By doubling both, and then correcting the
1042             // remainder, we can compute the larger division.
1043             // don't care about overflow here - in fact, we expect it
1044             proposed_m += proposed_m;
1045             const uint64_t twice_rem = rem + rem;
1046             if (twice_rem >= d || twice_rem < rem) proposed_m += 1;
1047             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
1048         }
1049         result.magic = 1 + proposed_m;
1050         result.more = more;
1051         // result.more's shift should in general be ceil_log_2_d. But if we
1052         // used the smaller power, we subtract one from the shift because we're
1053         // using the smaller power. If we're using the larger power, we
1054         // subtract one from the shift because it's taken care of by the add
1055         // indicator. So floor_log_2_d happens to be correct in both cases,
1056         // which is why we do it outside of the if statement.
1057     }
1058     return result;
1059 }
1060 
1061 struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
1062     return libdivide_internal_u64_gen(d, 0);
1063 }
1064 
1065 struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
1066     if (d == 1) {
1067         LIBDIVIDE_ERROR("branchfree divider must be != 1");
1068     }
1069     struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1);
1070     struct libdivide_u64_branchfree_t ret = {
1071         tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)};
1072     return ret;
1073 }
1074 
1075 uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
1076     uint8_t more = denom->more;
1077     if (!denom->magic) {
1078         return numer >> more;
1079     } else {
1080         uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
1081         if (more & LIBDIVIDE_ADD_MARKER) {
1082             uint64_t t = ((numer - q) >> 1) + q;
1083             return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
1084         } else {
1085             // All upper bits are 0,
1086             // don't need to mask them off.
1087             return q >> more;
1088         }
1089     }
1090 }
1091 
1092 uint64_t libdivide_u64_branchfree_do(
1093     uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
1094     uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
1095     uint64_t t = ((numer - q) >> 1) + q;
1096     return t >> denom->more;
1097 }
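// Illustrative check of the power-of-2 case: libdivide_u64_branchfree_gen(8) stores
// magic = 0 and more = 2 (the shift reduced by 1 in gen). Then q = 0,
// t = (numer >> 1), and t >> 2 == numer >> 3, i.e. numer / 8.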
1098 
1099 uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) {
1100     uint8_t more = denom->more;
1101     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1102 
1103     if (!denom->magic) {
1104         return (uint64_t)1 << shift;
1105     } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
1106         // We compute q = n/d = n*m / 2^(64 + shift)
1107         // Therefore we have d = 2^(64 + shift) / m
1108         // We need to ceil it.
1109         // We know d is not a power of 2, so m is not a power of 2,
1110         // so we can just add 1 to the floor
1111         uint64_t hi_dividend = (uint64_t)1 << shift;
1112         uint64_t rem_ignored;
1113         return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored);
1114     } else {
1115         // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
1116         // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
1117         // libdivide_u32_recover for more on what we do here.
1118         // TODO: do something better than 128 bit math
1119 
1120         // Full n is a (potentially) 129 bit value
1121         // half_n is a 128 bit value
1122         // Compute the hi half of half_n. Low half is 0.
1123         uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
1124         // d is a 65 bit value. The high bit is always set to 1.
1125         const uint64_t d_hi = 1, d_lo = denom->magic;
1126         // Note that the quotient is guaranteed <= 64 bits,
1127         // but the remainder may need 65!
1128         uint64_t r_hi, r_lo;
1129         uint64_t half_q =
1130             libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
1131         // We computed 2^(64+shift)/(m+2^64)
1132         // Double the remainder ('dr') and check if that is larger than d
1133         // Note that d is a 65 bit value, so r1 is small and so r1 + r1
1134         // cannot overflow
1135         uint64_t dr_lo = r_lo + r_lo;
1136         uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
1137         int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
1138         uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
1139         return full_q + 1;
1140     }
1141 }
1142 
1143 uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) {
1144     uint8_t more = denom->more;
1145     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1146 
1147     if (!denom->magic) {
1148         return (uint64_t)1 << (shift + 1);
1149     } else {
1150         // Here we wish to compute d = 2^(64+shift+1)/(m+2^64).
1151         // Notice (m + 2^64) is a 65 bit number. This gets hairy. See
1152         // libdivide_u32_recover for more on what we do here.
1153         // TODO: do something better than 128 bit math
1154 
1155         // Full n is a (potentially) 129 bit value
1156         // half_n is a 128 bit value
1157         // Compute the hi half of half_n. Low half is 0.
1158         uint64_t half_n_hi = (uint64_t)1 << shift, half_n_lo = 0;
1159         // d is a 65 bit value. The high bit is always set to 1.
1160         const uint64_t d_hi = 1, d_lo = denom->magic;
1161         // Note that the quotient is guaranteed <= 64 bits,
1162         // but the remainder may need 65!
1163         uint64_t r_hi, r_lo;
1164         uint64_t half_q =
1165             libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo);
1166         // We computed 2^(64+shift)/(m+2^64)
1167         // Double the remainder ('dr') and check if that is larger than d
1168         // Note that d is a 65 bit value, so r1 is small and so r1 + r1
1169         // cannot overflow
1170         uint64_t dr_lo = r_lo + r_lo;
1171         uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo);  // last term is carry
1172         int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo);
1173         uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0);
1174         return full_q + 1;
1175     }
1176 }
1177 
1178 /////////// SINT16
1179 
1180 static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
1181     int16_t d, int branchfree) {
1182     if (d == 0) {
1183         LIBDIVIDE_ERROR("divider must be != 0");
1184     }
1185 
1186     struct libdivide_s16_t result;
1187 
1188     // If d is a power of 2, or negative a power of 2, we have to use a shift.
1189     // This is especially important because the magic algorithm fails for -1.
1190     // To check if d is a power of 2 or its inverse, it suffices to check
1191     // whether its absolute value has exactly one bit set. This works even for
1192     // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
1193     // and is a power of 2.
1194     uint16_t ud = (uint16_t)d;
1195     uint16_t absD = (d < 0) ? -ud : ud;
1196     uint16_t floor_log_2_d = 15 - libdivide_count_leading_zeros16(absD);
1197     // check if exactly one bit is set,
1198     // don't care if absD is 0 since that's divide by zero
1199     if ((absD & (absD - 1)) == 0) {
1200         // Branchfree and normal paths are exactly the same
1201         result.magic = 0;
1202         result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
1203     } else {
1204         LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
1205 
1206         uint8_t more;
1207         // the dividend here is 2**(floor_log_2_d + 15), so the low 16 bit word
1208         // is 0 and the high word is 2**(floor_log_2_d - 1)
1209         uint16_t rem, proposed_m;
1210         proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
1211         const uint16_t e = absD - rem;
1212 
1213         // We are going to start with a power of floor_log_2_d - 1.
1214         // This works if e < 2**floor_log_2_d.
1215         if (!branchfree && e < ((uint16_t)1 << floor_log_2_d)) {
1216             // This power works
1217             more = (uint8_t)(floor_log_2_d - 1);
1218         } else {
1219             // We need to go one higher. This should not make proposed_m
1220             // overflow, but it will make it negative when interpreted as an
1221             // int16_t.
1222             proposed_m += proposed_m;
1223             const uint16_t twice_rem = rem + rem;
1224             if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
1225             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
1226         }
1227 
1228         proposed_m += 1;
1229         int16_t magic = (int16_t)proposed_m;
1230 
1231         // Mark if we are negative. Note we only negate the magic number in the
1232         // branchfull case.
1233         if (d < 0) {
1234             more |= LIBDIVIDE_NEGATIVE_DIVISOR;
1235             if (!branchfree) {
1236                 magic = -magic;
1237             }
1238         }
1239 
1240         result.more = more;
1241         result.magic = magic;
1242     }
1243     return result;
1244 }
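// Worked example (illustrative) for d = 7: floor_log_2_d = 2 and the 32/16 division
// gives proposed_m = 18724 with rem = 4, so e = 3 < 2^2 and the smaller power works:
// magic = 18725, more = 1. For d = -7 the same magic is computed, then negated
// (branchfull case only) and LIBDIVIDE_NEGATIVE_DIVISOR is set in more.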
1245 
1246 struct libdivide_s16_t libdivide_s16_gen(int16_t d) {
1247     return libdivide_internal_s16_gen(d, 0);
1248 }
1249 
1250 struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) {
1251     struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1);
1252     struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more};
1253     return result;
1254 }
1255 
1256 // The original libdivide_s16_do takes a const pointer. However, this cannot be used
1257 // with a compile time constant libdivide_s16_t: it will generate a warning about
1258 // taking the address of a temporary. Hence this overload.
1259 int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) {
1260     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
1261 
1262     if (!magic) {
1263         uint16_t sign = (int8_t)more >> 7;
1264         uint16_t mask = ((uint16_t)1 << shift) - 1;
1265         uint16_t uq = numer + ((numer >> 15) & mask);
1266         int16_t q = (int16_t)uq;
1267         q >>= shift;
1268         q = (q ^ sign) - sign;
1269         return q;
1270     } else {
1271         uint16_t uq = (uint16_t)libdivide_mullhi_s16(magic, numer);
1272         if (more & LIBDIVIDE_ADD_MARKER) {
1273             // must be arithmetic shift and then sign extend
1274             int16_t sign = (int8_t)more >> 7;
1275             // q += (more < 0 ? -numer : numer)
1276             // cast required to avoid UB
1277             uq += ((uint16_t)numer ^ sign) - sign;
1278         }
1279         int16_t q = (int16_t)uq;
1280         q >>= shift;
1281         q += (q < 0);
1282         return q;
1283     }
1284 }
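// Worked example (illustrative), continuing d = 7 (magic = 18725, more = 1, no add
// indicator): for numer = -20, libdivide_mullhi_s16(18725, -20) = -6, the arithmetic
// shift by 1 gives -3, and q += (q < 0) rounds toward zero to -2 == -20 / 7.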
1285 
1286 int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
1287     return libdivide_s16_do_raw(numer, denom->magic, denom->more);
1288 }
1289 
1290 int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) {
1291     uint8_t more = denom->more;
1292     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
1293     // must be arithmetic shift and then sign extend
1294     int16_t sign = (int8_t)more >> 7;
1295     int16_t magic = denom->magic;
1296     int16_t q = libdivide_mullhi_s16(magic, numer);
1297     q += numer;
1298 
1299     // If q is non-negative, we have nothing to do
1300     // If q is negative, we want to add either (2**shift)-1 if d is a power of
1301     // 2, or (2**shift) if it is not a power of 2
1302     uint16_t is_power_of_2 = (magic == 0);
1303     uint16_t q_sign = (uint16_t)(q >> 15);
1304     q += q_sign & (((uint16_t)1 << shift) - is_power_of_2);
1305 
1306     // Now arithmetic right shift
1307     q >>= shift;
1308     // Negate if needed
1309     q = (q ^ sign) - sign;
1310 
1311     return q;
1312 }
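// Worked example (illustrative): libdivide_s16_branchfree_gen(7) yields
// magic = -28086 (0x924A) and shift = 2. For numer = -20:
// q = mullhi_s16(magic, -20) + (-20) = 8 - 20 = -12; q is negative and d is not a
// power of 2, so 2^shift = 4 is added, giving -8, and -8 >> 2 = -2 == -20 / 7.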
1313 
1314 int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) {
1315     uint8_t more = denom->more;
1316     uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
1317     if (!denom->magic) {
1318         uint16_t absD = (uint16_t)1 << shift;
1319         if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
1320             absD = -absD;
1321         }
1322         return (int16_t)absD;
1323     } else {
1324         // Unsigned math is much easier
1325         // We negate the magic number only in the branchfull case, and we don't
1326         // know which case we're in. However we have enough information to
1327         // determine the correct sign of the magic number. The divisor was
1328         // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
1329         // the magic number's sign is opposite that of the divisor.
1330         // We want to compute the positive magic number.
1331         int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
1332         int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
1333 
1334         // Handle the power of 2 case (including branchfree)
1335         if (denom->magic == 0) {
1336             int16_t result = (uint16_t)1 << shift;
1337             return negative_divisor ? -result : result;
1338         }
1339 
1340         uint16_t d = (uint16_t)(magic_was_negated ? -denom->magic : denom->magic);
1341         uint32_t n = (uint32_t)1 << (16 + shift);  // this shift cannot exceed 30
1342         uint16_t q = (uint16_t)(n / d);
1343         int16_t result = (int16_t)q;
1344         result += 1;
1345         return negative_divisor ? -result : result;
1346     }
1347 }
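
// Explanatory note on the multiply path above: the divisor is reconstructed as
// floor(2**(16 + shift) / |magic|) + 1 and then negated when
// LIBDIVIDE_NEGATIVE_DIVISOR is set; the power-of-2 path simply returns
// +/- 2**shift.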
1348 
1349 int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) {
1350     return libdivide_s16_recover((const struct libdivide_s16_t *)denom);
1351 }
1352 
1353 /////////// SINT32
1354 
1355 static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen(
1356     int32_t d, int branchfree) {
1357     if (d == 0) {
1358         LIBDIVIDE_ERROR("divider must be != 0");
1359     }
1360 
1361     struct libdivide_s32_t result;
1362 
1363     // If d is a power of 2, or negative a power of 2, we have to use a shift.
1364     // This is especially important because the magic algorithm fails for -1.
1365     // To check if d is a power of 2 or its inverse, it suffices to check
1366     // whether its absolute value has exactly one bit set. This works even for
1367     // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
1368     // and is a power of 2.
1369     uint32_t ud = (uint32_t)d;
1370     uint32_t absD = (d < 0) ? -ud : ud;
1371     uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);
1372     // check if exactly one bit is set,
1373     // don't care if absD is 0 since that's divide by zero
1374     if ((absD & (absD - 1)) == 0) {
1375         // Branchfree and normal paths are exactly the same
1376         result.magic = 0;
1377         result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
1378     } else {
1379         LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
1380 
1381         uint8_t more;
1382         // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word
1383         // is 0 and the high word is 2**(floor_log_2_d - 1)
1384         uint32_t rem, proposed_m;
1385         proposed_m = libdivide_64_div_32_to_32((uint32_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
1386         const uint32_t e = absD - rem;
1387 
1388         // We are going to start with a power of floor_log_2_d - 1.
1389         // This works if e < 2**floor_log_2_d.
1390         if (!branchfree && e < ((uint32_t)1 << floor_log_2_d)) {
1391             // This power works
1392             more = (uint8_t)(floor_log_2_d - 1);
1393         } else {
1394             // We need to go one higher. This should not make proposed_m
1395             // overflow, but it will make it negative when interpreted as an
1396             // int32_t.
1397             proposed_m += proposed_m;
1398             const uint32_t twice_rem = rem + rem;
1399             if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
1400             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
1401         }
1402 
1403         proposed_m += 1;
1404         int32_t magic = (int32_t)proposed_m;
1405 
1406         // Mark if we are negative. Note we only negate the magic number in the
1407         // branchfull case.
1408         if (d < 0) {
1409             more |= LIBDIVIDE_NEGATIVE_DIVISOR;
1410             if (!branchfree) {
1411                 magic = -magic;
1412             }
1413         }
1414 
1415         result.more = more;
1416         result.magic = magic;
1417     }
1418     return result;
1419 }
1420 
1421 struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
1422     return libdivide_internal_s32_gen(d, 0);
1423 }
1424 
1425 struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
1426     struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
1427     struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
1428     return result;
1429 }
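
// Usage sketch for the branchfree variant (illustrative; names are hypothetical).
// The branchfree "do" below contains no data-dependent branches, which is also
// what the vector code paths later in this file build on.
//
//     struct libdivide_s32_branchfree_t bf13 = libdivide_s32_branchfree_gen(13);
//     int32_t q = libdivide_s32_branchfree_do(x, &bf13);  // q == x / 13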
1430 
1431 int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
1432     uint8_t more = denom->more;
1433     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1434 
1435     if (!denom->magic) {
1436         uint32_t sign = (int8_t)more >> 7;
1437         uint32_t mask = ((uint32_t)1 << shift) - 1;
1438         uint32_t uq = numer + ((numer >> 31) & mask);
1439         int32_t q = (int32_t)uq;
1440         q >>= shift;
1441         q = (q ^ sign) - sign;
1442         return q;
1443     } else {
1444         uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer);
1445         if (more & LIBDIVIDE_ADD_MARKER) {
1446             // must be arithmetic shift and then sign extend
1447             int32_t sign = (int8_t)more >> 7;
1448             // q += (more < 0 ? -numer : numer)
1449             // cast required to avoid UB
1450             uq += ((uint32_t)numer ^ sign) - sign;
1451         }
1452         int32_t q = (int32_t)uq;
1453         q >>= shift;
1454         q += (q < 0);
1455         return q;
1456     }
1457 }
1458 
1459 int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
1460     uint8_t more = denom->more;
1461     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1462     // must be arithmetic shift and then sign extend
1463     int32_t sign = (int8_t)more >> 7;
1464     int32_t magic = denom->magic;
1465     int32_t q = libdivide_mullhi_s32(magic, numer);
1466     q += numer;
1467 
1468     // If q is non-negative, we have nothing to do
1469     // If q is negative, we want to add either (2**shift)-1 if d is a power of
1470     // 2, or (2**shift) if it is not a power of 2
1471     uint32_t is_power_of_2 = (magic == 0);
1472     uint32_t q_sign = (uint32_t)(q >> 31);
1473     q += q_sign & (((uint32_t)1 << shift) - is_power_of_2);
1474 
1475     // Now arithmetic right shift
1476     q >>= shift;
1477     // Negate if needed
1478     q = (q ^ sign) - sign;
1479 
1480     return q;
1481 }
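
// Worked example of the adjustment above (power-of-2 case, magic == 0): for
// d == 8 the stored shift is 3, so numer == -7 becomes q = -7 + (2**3 - 1) = 0,
// and 0 >> 3 == 0, matching C's truncating division (-7 / 8 == 0). A plain
// arithmetic shift without the adjustment would yield -1 (floor division).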
1482 
1483 int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {
1484     uint8_t more = denom->more;
1485     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1486     if (!denom->magic) {
1487         uint32_t absD = (uint32_t)1 << shift;
1488         if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
1489             absD = -absD;
1490         }
1491         return (int32_t)absD;
1492     } else {
1493         // Unsigned math is much easier
1494         // We negate the magic number only in the branchfull case, and we don't
1495         // know which case we're in. However we have enough information to
1496         // determine the correct sign of the magic number. The divisor was
1497         // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set,
1498         // the magic number's sign is opposite that of the divisor.
1499         // We want to compute the positive magic number.
1500         int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
1501         int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
1502 
1503         // Handle the power of 2 case (including branchfree)
1504         if (denom->magic == 0) {
1505             int32_t result = (uint32_t)1 << shift;
1506             return negative_divisor ? -result : result;
1507         }
1508 
1509         uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic);
1510         uint64_t n = (uint64_t)1 << (32 + shift);  // this shift cannot exceed 30
1511         uint32_t q = (uint32_t)(n / d);
1512         int32_t result = (int32_t)q;
1513         result += 1;
1514         return negative_divisor ? -result : result;
1515     }
1516 }
1517 
1518 int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) {
1519     return libdivide_s32_recover((const struct libdivide_s32_t *)denom);
1520 }
1521 
1522 ///////////// SINT64
1523 
1524 static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen(
1525     int64_t d, int branchfree) {
1526     if (d == 0) {
1527         LIBDIVIDE_ERROR("divider must be != 0");
1528     }
1529 
1530     struct libdivide_s64_t result;
1531 
1532     // If d is a power of 2, or negative a power of 2, we have to use a shift.
1533     // This is especially important because the magic algorithm fails for -1.
1534     // To check if d is a power of 2 or its inverse, it suffices to check
1535     // whether its absolute value has exactly one bit set.  This works even for
1536     // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set
1537     // and is a power of 2.
1538     uint64_t ud = (uint64_t)d;
1539     uint64_t absD = (d < 0) ? -ud : ud;
1540     uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD);
1541     // check if exactly one bit is set,
1542     // don't care if absD is 0 since that's divide by zero
1543     if ((absD & (absD - 1)) == 0) {
1544         // Branchfree and non-branchfree cases are the same
1545         result.magic = 0;
1546         result.more = (uint8_t)(floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0));
1547     } else {
1548         // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word
1549         // is 0 and the high word is 2**(floor_log_2_d - 1)
1550         uint8_t more;
1551         uint64_t rem, proposed_m;
1552         proposed_m = libdivide_128_div_64_to_64((uint64_t)1 << (floor_log_2_d - 1), 0, absD, &rem);
1553         const uint64_t e = absD - rem;
1554 
1555         // We are going to start with a power of floor_log_2_d - 1.
1556         // This works if e < 2**floor_log_2_d.
1557         if (!branchfree && e < ((uint64_t)1 << floor_log_2_d)) {
1558             // This power works
1559             more = (uint8_t)(floor_log_2_d - 1);
1560         } else {
1561             // We need to go one higher. This should not make proposed_m
1562             // overflow, but it will make it negative when interpreted as an
1563             // int64_t.
1564             proposed_m += proposed_m;
1565             const uint64_t twice_rem = rem + rem;
1566             if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
1567             // Note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we
1568             // also set ADD_MARKER; this is an annoying optimization that
1569             // enables algorithm #4 to avoid the mask. However, we always set it
1570             // in the branchfree case.
1571             more = (uint8_t)(floor_log_2_d | LIBDIVIDE_ADD_MARKER);
1572         }
1573         proposed_m += 1;
1574         int64_t magic = (int64_t)proposed_m;
1575 
1576         // Mark if we are negative
1577         if (d < 0) {
1578             more |= LIBDIVIDE_NEGATIVE_DIVISOR;
1579             if (!branchfree) {
1580                 magic = -magic;
1581             }
1582         }
1583 
1584         result.more = more;
1585         result.magic = magic;
1586     }
1587     return result;
1588 }
1589 
1590 struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
1591     return libdivide_internal_s64_gen(d, 0);
1592 }
1593 
1594 struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
1595     struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
1596     struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
1597     return ret;
1598 }
1599 
1600 int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
1601     uint8_t more = denom->more;
1602     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1603 
1604     if (!denom->magic) {  // shift path
1605         uint64_t mask = ((uint64_t)1 << shift) - 1;
1606         uint64_t uq = numer + ((numer >> 63) & mask);
1607         int64_t q = (int64_t)uq;
1608         q >>= shift;
1609         // must be arithmetic shift and then sign-extend
1610         int64_t sign = (int8_t)more >> 7;
1611         q = (q ^ sign) - sign;
1612         return q;
1613     } else {
1614         uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer);
1615         if (more & LIBDIVIDE_ADD_MARKER) {
1616             // must be arithmetic shift and then sign extend
1617             int64_t sign = (int8_t)more >> 7;
1618             // q += (more < 0 ? -numer : numer)
1619             // cast required to avoid UB
1620             uq += ((uint64_t)numer ^ sign) - sign;
1621         }
1622         int64_t q = (int64_t)uq;
1623         q >>= shift;
1624         q += (q < 0);
1625         return q;
1626     }
1627 }
1628 
1629 int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
1630     uint8_t more = denom->more;
1631     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1632     // must be arithmetic shift and then sign extend
1633     int64_t sign = (int8_t)more >> 7;
1634     int64_t magic = denom->magic;
1635     int64_t q = libdivide_mullhi_s64(magic, numer);
1636     q += numer;
1637 
1638     // If q is non-negative, we have nothing to do.
1639     // If q is negative, we want to add either (2**shift)-1 if d is a power of
1640     // 2, or (2**shift) if it is not a power of 2.
1641     uint64_t is_power_of_2 = (magic == 0);
1642     uint64_t q_sign = (uint64_t)(q >> 63);
1643     q += q_sign & (((uint64_t)1 << shift) - is_power_of_2);
1644 
1645     // Arithmetic right shift
1646     q >>= shift;
1647     // Negate if needed
1648     q = (q ^ sign) - sign;
1649 
1650     return q;
1651 }
1652 
1653 int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {
1654     uint8_t more = denom->more;
1655     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1656     if (denom->magic == 0) {  // shift path
1657         uint64_t absD = (uint64_t)1 << shift;
1658         if (more & LIBDIVIDE_NEGATIVE_DIVISOR) {
1659             absD = -absD;
1660         }
1661         return (int64_t)absD;
1662     } else {
1663         // Unsigned math is much easier
1664         int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR);
1665         int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0;
1666 
1667         uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic);
1668         uint64_t n_hi = (uint64_t)1 << shift, n_lo = 0;
1669         uint64_t rem_ignored;
1670         uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored);
1671         int64_t result = (int64_t)(q + 1);
1672         if (negative_divisor) {
1673             result = -result;
1674         }
1675         return result;
1676     }
1677 }
1678 
1679 int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) {
1680     return libdivide_s64_recover((const struct libdivide_s64_t *)denom);
1681 }
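
// Round-trip sketch (illustrative): recover is intended to return the divisor
// the struct was generated from, e.g.
//
//     struct libdivide_s64_t den = libdivide_s64_gen(d);  // d != 0
//     // libdivide_s64_recover(&den) == d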
1682 
1683 // Simplest possible vector type division: treat the vector type as an array
1684 // of underlying native type.
1685 #define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
1686     const size_t count = sizeof(VecT) / sizeof(IntT); \
1687     VecT result; \
1688     IntT *pSource = (IntT *)&numers; \
1689     IntT *pTarget = (IntT *)&result; \
1690     for (size_t loop=0; loop<count; ++loop) { \
1691         pTarget[loop] = libdivide_##Algo##_do(pSource[loop], denom); \
1692     } \
1693     return result;
1694 
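// For reference, SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16) expands to a
// scalar loop over the vector's lanes, roughly:
//
//     const size_t count = sizeof(uint16x8_t) / sizeof(uint16_t);  // 8 lanes
//     uint16x8_t result;
//     uint16_t *pSource = (uint16_t *)&numers;
//     uint16_t *pTarget = (uint16_t *)&result;
//     for (size_t loop = 0; loop < count; ++loop) {
//         pTarget[loop] = libdivide_u16_do(pSource[loop], denom);
//     }
//     return result;
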
1695 #if defined(LIBDIVIDE_NEON)
1696 
1697 static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_do_vec128(
1698     uint16x8_t numers, const struct libdivide_u16_t *denom);
1699 static LIBDIVIDE_INLINE int16x8_t libdivide_s16_do_vec128(
1700     int16x8_t numers, const struct libdivide_s16_t *denom);
1701 static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_do_vec128(
1702     uint32x4_t numers, const struct libdivide_u32_t *denom);
1703 static LIBDIVIDE_INLINE int32x4_t libdivide_s32_do_vec128(
1704     int32x4_t numers, const struct libdivide_s32_t *denom);
1705 static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_do_vec128(
1706     uint64x2_t numers, const struct libdivide_u64_t *denom);
1707 static LIBDIVIDE_INLINE int64x2_t libdivide_s64_do_vec128(
1708     int64x2_t numers, const struct libdivide_s64_t *denom);
1709 
1710 static LIBDIVIDE_INLINE uint16x8_t libdivide_u16_branchfree_do_vec128(
1711     uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom);
1712 static LIBDIVIDE_INLINE int16x8_t libdivide_s16_branchfree_do_vec128(
1713     int16x8_t numers, const struct libdivide_s16_branchfree_t *denom);
1714 static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_branchfree_do_vec128(
1715     uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom);
1716 static LIBDIVIDE_INLINE int32x4_t libdivide_s32_branchfree_do_vec128(
1717     int32x4_t numers, const struct libdivide_s32_branchfree_t *denom);
1718 static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_branchfree_do_vec128(
1719     uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom);
1720 static LIBDIVIDE_INLINE int64x2_t libdivide_s64_branchfree_do_vec128(
1721     int64x2_t numers, const struct libdivide_s64_branchfree_t *denom);
1722 
1723 //////// Internal Utility Functions
1724 
1725 // Logical right shift by runtime value.
1726 // NEON implements right shift as left shifts by negative values.
1727 static LIBDIVIDE_INLINE uint32x4_t libdivide_u32_neon_srl(uint32x4_t v, uint8_t amt) {
1728     int32_t wamt = (int32_t)(amt);
1729     return vshlq_u32(v, vdupq_n_s32(-wamt));
1730 }
1731 
1732 static LIBDIVIDE_INLINE uint64x2_t libdivide_u64_neon_srl(uint64x2_t v, uint8_t amt) {
1733     int64_t wamt = (int64_t)(amt);
1734     return vshlq_u64(v, vdupq_n_s64(-wamt));
1735 }
1736 
1737 // Arithmetic right shift by runtime value.
1738 static LIBDIVIDE_INLINE int32x4_t libdivide_s32_neon_sra(int32x4_t v, uint8_t amt) {
1739     int32_t wamt = (int32_t)(amt);
1740     return vshlq_s32(v, vdupq_n_s32(-wamt));
1741 }
1742 
1743 static LIBDIVIDE_INLINE int64x2_t libdivide_s64_neon_sra(int64x2_t v, uint8_t amt) {
1744     int64_t wamt = (int64_t)(amt);
1745     return vshlq_s64(v, vdupq_n_s64(-wamt));
1746 }
1747 
1748 static LIBDIVIDE_INLINE int64x2_t libdivide_s64_signbits(int64x2_t v) { return vshrq_n_s64(v, 63); }
1749 
1750 static LIBDIVIDE_INLINE uint32x4_t libdivide_mullhi_u32_vec128(uint32x4_t a, uint32_t b) {
1751     // Desired result is [x0, x1, x2, x3]
1752     uint32x4_t w1 = vreinterpretq_u32_u64(vmull_n_u32(vget_low_u32(a), b));  // [_, x0, _, x1]
1753     uint32x4_t w2 = vreinterpretq_u32_u64(vmull_high_n_u32(a, b));           //[_, x2, _, x3]
1754     return vuzp2q_u32(w1, w2);                                               // [x0, x1, x2, x3]
1755 }
1756 
1757 static LIBDIVIDE_INLINE int32x4_t libdivide_mullhi_s32_vec128(int32x4_t a, int32_t b) {
1758     int32x4_t w1 = vreinterpretq_s32_s64(vmull_n_s32(vget_low_s32(a), b));  // [_, x0, _, x1]
1759     int32x4_t w2 = vreinterpretq_s32_s64(vmull_high_n_s32(a, b));           //[_, x2, _, x3]
1760     return vuzp2q_s32(w1, w2);                                              // [x0, x1, x2, x3]
1761 }
1762 
1763 static LIBDIVIDE_INLINE uint64x2_t libdivide_mullhi_u64_vec128(uint64x2_t x, uint64_t sy) {
1764     // full 128 bits product is:
1765     // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
1766     // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.
1767 
1768     // Get low and high words. x0 contains low 32 bits, x1 is high 32 bits.
1769     uint64x2_t y = vdupq_n_u64(sy);
1770     uint32x2_t x0 = vmovn_u64(x);
1771     uint32x2_t y0 = vmovn_u64(y);
1772     uint32x2_t x1 = vshrn_n_u64(x, 32);
1773     uint32x2_t y1 = vshrn_n_u64(y, 32);
1774 
1775     // Compute x0*y0.
1776     uint64x2_t x0y0 = vmull_u32(x0, y0);
1777     uint64x2_t x0y0_hi = vshrq_n_u64(x0y0, 32);
1778 
1779     // Compute other intermediate products.
1780     uint64x2_t temp = vmlal_u32(x0y0_hi, x1, y0);  // temp = x0y0_hi + x1*y0;
1781     // We want to split temp into its low 32 bits and high 32 bits, both
1782     // in the low half of 64 bit registers.
1783     // Use shifts to avoid needing a reg for the mask.
1784     uint64x2_t temp_lo = vshrq_n_u64(vshlq_n_u64(temp, 32), 32);  // temp_lo = temp & 0xFFFFFFFF;
1785     uint64x2_t temp_hi = vshrq_n_u64(temp, 32);                   // temp_hi = temp >> 32;
1786 
1787     temp_lo = vmlal_u32(temp_lo, x0, y1);  // temp_lo += x0*y1
1788     temp_lo = vshrq_n_u64(temp_lo, 32);    // temp_lo >>= 32
1789     temp_hi = vmlal_u32(temp_hi, x1, y1);  // temp_hi += x1*y1
1790     uint64x2_t result = vaddq_u64(temp_hi, temp_lo);
1791     return result;
1792 }
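
// In scalar terms the code above is a 32x32->64 schoolbook multiply that keeps
// only the high 64 bits of x*y:
//     t         = x1*y0 + ((x0*y0) >> 32)
//     hi64(x*y) = x1*y1 + (t >> 32) + (((t & 0xFFFFFFFF) + x0*y1) >> 32)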
1793 
1794 static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64_t sy) {
1795     int64x2_t p = vreinterpretq_s64_u64(
1796         libdivide_mullhi_u64_vec128(vreinterpretq_u64_s64(x), (uint64_t)(sy)));
1797     int64x2_t y = vdupq_n_s64(sy);
1798     int64x2_t t1 = vandq_s64(libdivide_s64_signbits(x), y);
1799     int64x2_t t2 = vandq_s64(libdivide_s64_signbits(y), x);
1800     p = vsubq_s64(p, t1);
1801     p = vsubq_s64(p, t2);
1802     return p;
1803 }
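
// The two subtractions above apply the usual signed-from-unsigned correction:
//     mullhi_s64(x, y) = mullhi_u64(x, y) - (x < 0 ? y : 0) - (y < 0 ? x : 0)
// with libdivide_s64_signbits() providing the (x < 0) and (y < 0) masks.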
1804 
1805 ////////// UINT16
1806 
1807 uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom) {
1808     SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)
1809 }
1810 
1811 uint16x8_t libdivide_u16_branchfree_do_vec128(uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom) {
1812     SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)
1813 }
1814 
1815 ////////// UINT32
1816 
1817 uint32x4_t libdivide_u32_do_vec128(uint32x4_t numers, const struct libdivide_u32_t *denom) {
1818     uint8_t more = denom->more;
1819     if (!denom->magic) {
1820         return libdivide_u32_neon_srl(numers, more);
1821     } else {
1822         uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
1823         if (more & LIBDIVIDE_ADD_MARKER) {
1824             // uint32_t t = ((numer - q) >> 1) + q;
1825             // return t >> denom->shift;
1826             // Note we can use halving-subtract to avoid the shift.
1827             uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1828             uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
1829             return libdivide_u32_neon_srl(t, shift);
1830         } else {
1831             return libdivide_u32_neon_srl(q, more);
1832         }
1833     }
1834 }
1835 
1836 uint32x4_t libdivide_u32_branchfree_do_vec128(
1837     uint32x4_t numers, const struct libdivide_u32_branchfree_t *denom) {
1838     uint32x4_t q = libdivide_mullhi_u32_vec128(numers, denom->magic);
1839     uint32x4_t t = vaddq_u32(vhsubq_u32(numers, q), q);
1840     return libdivide_u32_neon_srl(t, denom->more);
1841 }
1842 
1843 ////////// UINT64
1844 
1845 uint64x2_t libdivide_u64_do_vec128(uint64x2_t numers, const struct libdivide_u64_t *denom) {
1846     uint8_t more = denom->more;
1847     if (!denom->magic) {
1848         return libdivide_u64_neon_srl(numers, more);
1849     } else {
1850         uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
1851         if (more & LIBDIVIDE_ADD_MARKER) {
1852             // uint64_t t = ((numer - q) >> 1) + q;
1853             // return t >> denom->shift;
1854             // No 64-bit halving subtracts in NEON :(
1855             uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1856             uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
1857             return libdivide_u64_neon_srl(t, shift);
1858         } else {
1859             return libdivide_u64_neon_srl(q, more);
1860         }
1861     }
1862 }
1863 
1864 uint64x2_t libdivide_u64_branchfree_do_vec128(
1865     uint64x2_t numers, const struct libdivide_u64_branchfree_t *denom) {
1866     uint64x2_t q = libdivide_mullhi_u64_vec128(numers, denom->magic);
1867     uint64x2_t t = vaddq_u64(vshrq_n_u64(vsubq_u64(numers, q), 1), q);
1868     return libdivide_u64_neon_srl(t, denom->more);
1869 }
1870 
1871 ////////// SINT16
1872 
1873 int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom) {
1874     SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)
1875 }
1876 
1877 int16x8_t libdivide_s16_branchfree_do_vec128(int16x8_t numers, const struct libdivide_s16_branchfree_t *denom) {
1878     SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)
1879 }
1880 
1881 ////////// SINT32
1882 
1883 int32x4_t libdivide_s32_do_vec128(int32x4_t numers, const struct libdivide_s32_t *denom) {
1884     uint8_t more = denom->more;
1885     if (!denom->magic) {
1886         uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1887         uint32_t mask = ((uint32_t)1 << shift) - 1;
1888         int32x4_t roundToZeroTweak = vdupq_n_s32((int)mask);
1889         // q = numer + ((numer >> 31) & roundToZeroTweak);
1890         int32x4_t q = vaddq_s32(numers, vandq_s32(vshrq_n_s32(numers, 31), roundToZeroTweak));
1891         q = libdivide_s32_neon_sra(q, shift);
1892         int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
1893         // q = (q ^ sign) - sign;
1894         q = vsubq_s32(veorq_s32(q, sign), sign);
1895         return q;
1896     } else {
1897         int32x4_t q = libdivide_mullhi_s32_vec128(numers, denom->magic);
1898         if (more & LIBDIVIDE_ADD_MARKER) {
1899             // must be arithmetic shift
1900             int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
1901             // q += ((numer ^ sign) - sign);
1902             q = vaddq_s32(q, vsubq_s32(veorq_s32(numers, sign), sign));
1903         }
1904         // q >>= shift
1905         q = libdivide_s32_neon_sra(q, more & LIBDIVIDE_32_SHIFT_MASK);
1906         q = vaddq_s32(
1907             q, vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_s32(q), 31)));  // q += (q < 0)
1908         return q;
1909     }
1910 }
1911 
1912 int32x4_t libdivide_s32_branchfree_do_vec128(
1913     int32x4_t numers, const struct libdivide_s32_branchfree_t *denom) {
1914     int32_t magic = denom->magic;
1915     uint8_t more = denom->more;
1916     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
1917     // must be arithmetic shift
1918     int32x4_t sign = vdupq_n_s32((int8_t)more >> 7);
1919     int32x4_t q = libdivide_mullhi_s32_vec128(numers, magic);
1920     q = vaddq_s32(q, numers);  // q += numers
1921 
1922     // If q is non-negative, we have nothing to do
1923     // If q is negative, we want to add either (2**shift)-1 if d is
1924     // a power of 2, or (2**shift) if it is not a power of 2
1925     uint32_t is_power_of_2 = (magic == 0);
1926     int32x4_t q_sign = vshrq_n_s32(q, 31);  // q_sign = q >> 31
1927     int32x4_t mask = vdupq_n_s32(((uint32_t)1 << shift) - is_power_of_2);
1928     q = vaddq_s32(q, vandq_s32(q_sign, mask));  // q = q + (q_sign & mask)
1929     q = libdivide_s32_neon_sra(q, shift);       // q >>= shift
1930     q = vsubq_s32(veorq_s32(q, sign), sign);    // q = (q ^ sign) - sign
1931     return q;
1932 }
1933 
1934 ////////// SINT64
1935 
1936 int64x2_t libdivide_s64_do_vec128(int64x2_t numers, const struct libdivide_s64_t *denom) {
1937     uint8_t more = denom->more;
1938     int64_t magic = denom->magic;
1939     if (magic == 0) {  // shift path
1940         uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1941         uint64_t mask = ((uint64_t)1 << shift) - 1;
1942         int64x2_t roundToZeroTweak = vdupq_n_s64(mask);  // TODO: no need to sign extend
1943         // q = numer + ((numer >> 63) & roundToZeroTweak);
1944         int64x2_t q =
1945             vaddq_s64(numers, vandq_s64(libdivide_s64_signbits(numers), roundToZeroTweak));
1946         q = libdivide_s64_neon_sra(q, shift);
1947         // q = (q ^ sign) - sign;
1948         int64x2_t sign = vreinterpretq_s64_s8(vdupq_n_s8((int8_t)more >> 7));
1949         q = vsubq_s64(veorq_s64(q, sign), sign);
1950         return q;
1951     } else {
1952         int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
1953         if (more & LIBDIVIDE_ADD_MARKER) {
1954             // must be arithmetic shift
1955             int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: no need to widen
1956             // q += ((numer ^ sign) - sign);
1957             q = vaddq_s64(q, vsubq_s64(veorq_s64(numers, sign), sign));
1958         }
1959         // q >>= shift
1960         q = libdivide_s64_neon_sra(q, more & LIBDIVIDE_64_SHIFT_MASK);
1961         q = vaddq_s64(
1962             q, vreinterpretq_s64_u64(vshrq_n_u64(vreinterpretq_u64_s64(q), 63)));  // q += (q < 0)
1963         return q;
1964     }
1965 }
1966 
1967 int64x2_t libdivide_s64_branchfree_do_vec128(
1968     int64x2_t numers, const struct libdivide_s64_branchfree_t *denom) {
1969     int64_t magic = denom->magic;
1970     uint8_t more = denom->more;
1971     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
1972     // must be arithmetic shift
1973     int64x2_t sign = vdupq_n_s64((int8_t)more >> 7);  // TODO: avoid sign extend
1974 
1975     // libdivide_mullhi_s64(numers, magic);
1976     int64x2_t q = libdivide_mullhi_s64_vec128(numers, magic);
1977     q = vaddq_s64(q, numers);  // q += numers
1978 
1979     // If q is non-negative, we have nothing to do.
1980     // If q is negative, we want to add either (2**shift)-1 if d is
1981     // a power of 2, or (2**shift) if it is not a power of 2.
1982     uint32_t is_power_of_2 = (magic == 0);
1983     int64x2_t q_sign = libdivide_s64_signbits(q);  // q_sign = q >> 63
1984     int64x2_t mask = vdupq_n_s64(((uint64_t)1 << shift) - is_power_of_2);
1985     q = vaddq_s64(q, vandq_s64(q_sign, mask));  // q = q + (q_sign & mask)
1986     q = libdivide_s64_neon_sra(q, shift);       // q >>= shift
1987     q = vsubq_s64(veorq_s64(q, sign), sign);    // q = (q ^ sign) - sign
1988     return q;
1989 }
1990 
1991 #endif
1992 
1993 #if defined(LIBDIVIDE_AVX512)
1994 
1995 static LIBDIVIDE_INLINE __m512i libdivide_u16_do_vec512(
1996     __m512i numers, const struct libdivide_u16_t *denom);
1997 static LIBDIVIDE_INLINE __m512i libdivide_s16_do_vec512(
1998     __m512i numers, const struct libdivide_s16_t *denom);
1999 static LIBDIVIDE_INLINE __m512i libdivide_u32_do_vec512(
2000     __m512i numers, const struct libdivide_u32_t *denom);
2001 static LIBDIVIDE_INLINE __m512i libdivide_s32_do_vec512(
2002     __m512i numers, const struct libdivide_s32_t *denom);
2003 static LIBDIVIDE_INLINE __m512i libdivide_u64_do_vec512(
2004     __m512i numers, const struct libdivide_u64_t *denom);
2005 static LIBDIVIDE_INLINE __m512i libdivide_s64_do_vec512(
2006     __m512i numers, const struct libdivide_s64_t *denom);
2007 
2008 static LIBDIVIDE_INLINE __m512i libdivide_u16_branchfree_do_vec512(
2009     __m512i numers, const struct libdivide_u16_branchfree_t *denom);
2010 static LIBDIVIDE_INLINE __m512i libdivide_s16_branchfree_do_vec512(
2011     __m512i numers, const struct libdivide_s16_branchfree_t *denom);
2012 static LIBDIVIDE_INLINE __m512i libdivide_u32_branchfree_do_vec512(
2013     __m512i numers, const struct libdivide_u32_branchfree_t *denom);
2014 static LIBDIVIDE_INLINE __m512i libdivide_s32_branchfree_do_vec512(
2015     __m512i numers, const struct libdivide_s32_branchfree_t *denom);
2016 static LIBDIVIDE_INLINE __m512i libdivide_u64_branchfree_do_vec512(
2017     __m512i numers, const struct libdivide_u64_branchfree_t *denom);
2018 static LIBDIVIDE_INLINE __m512i libdivide_s64_branchfree_do_vec512(
2019     __m512i numers, const struct libdivide_s64_branchfree_t *denom);
2020 
2021 //////// Internal Utility Functions
2022 
2023 static LIBDIVIDE_INLINE __m512i libdivide_s64_signbits_vec512(__m512i v) {
2025     return _mm512_srai_epi64(v, 63);
2026 }
2027 
2028 static LIBDIVIDE_INLINE __m512i libdivide_s64_shift_right_vec512(__m512i v, int amt) {
2029     return _mm512_srai_epi64(v, amt);
2030 }
2031 
2032 // Here, b is assumed to contain one 32-bit value repeated.
2033 static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u32_vec512(__m512i a, __m512i b) {
2034     __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32);
2035     __m512i a1X3X = _mm512_srli_epi64(a, 32);
2036     __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
2037     __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask);
2038     return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
2039 }
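
// How the lane juggling above works: _mm512_mul_epu32 multiplies the even
// 32-bit lanes into 64-bit products, so the first shift leaves their high
// halves in the even lanes. Shifting `a` right by 32 moves its odd lanes into
// multiplying position; the mask keeps the high halves of those products in
// the odd lanes, and the final OR interleaves both sets of high halves.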
2040 
2041 // b is one 32-bit value repeated.
2042 static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s32_vec512(__m512i a, __m512i b) {
2043     __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32);
2044     __m512i a1X3X = _mm512_srli_epi64(a, 32);
2045     __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0);
2046     __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask);
2047     return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3);
2048 }
2049 
2050 // Here, y is assumed to contain one 64-bit value repeated.
2051 static LIBDIVIDE_INLINE __m512i libdivide_mullhi_u64_vec512(__m512i x, __m512i y) {
2052     // see m128i variant for comments.
2053     __m512i x0y0 = _mm512_mul_epu32(x, y);
2054     __m512i x0y0_hi = _mm512_srli_epi64(x0y0, 32);
2055 
2056     __m512i x1 = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
2057     __m512i y1 = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM)_MM_SHUFFLE(3, 3, 1, 1));
2058 
2059     __m512i x0y1 = _mm512_mul_epu32(x, y1);
2060     __m512i x1y0 = _mm512_mul_epu32(x1, y);
2061     __m512i x1y1 = _mm512_mul_epu32(x1, y1);
2062 
2063     __m512i mask = _mm512_set1_epi64(0xFFFFFFFF);
2064     __m512i temp = _mm512_add_epi64(x1y0, x0y0_hi);
2065     __m512i temp_lo = _mm512_and_si512(temp, mask);
2066     __m512i temp_hi = _mm512_srli_epi64(temp, 32);
2067 
2068     temp_lo = _mm512_srli_epi64(_mm512_add_epi64(temp_lo, x0y1), 32);
2069     temp_hi = _mm512_add_epi64(x1y1, temp_hi);
2070     return _mm512_add_epi64(temp_lo, temp_hi);
2071 }
2072 
2073 // y is one 64-bit value repeated.
2074 static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y) {
2075     __m512i p = libdivide_mullhi_u64_vec512(x, y);
2076     __m512i t1 = _mm512_and_si512(libdivide_s64_signbits_vec512(x), y);
2077     __m512i t2 = _mm512_and_si512(libdivide_s64_signbits_vec512(y), x);
2078     p = _mm512_sub_epi64(p, t1);
2079     p = _mm512_sub_epi64(p, t2);
2080     return p;
2081 }
2082 
2083 ////////// UINT16
2084 
2085 __m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom) {
2086     SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)
2087 }
2088 
2089 __m512i libdivide_u16_branchfree_do_vec512(__m512i numers, const struct libdivide_u16_branchfree_t *denom) {
2090     SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)
2091 }
2092 
2093 ////////// UINT32
2094 
2095 __m512i libdivide_u32_do_vec512(__m512i numers, const struct libdivide_u32_t *denom) {
2096     uint8_t more = denom->more;
2097     if (!denom->magic) {
2098         return _mm512_srli_epi32(numers, more);
2099     } else {
2100         __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
2101         if (more & LIBDIVIDE_ADD_MARKER) {
2102             // uint32_t t = ((numer - q) >> 1) + q;
2103             // return t >> denom->shift;
2104             uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2105             __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
2106             return _mm512_srli_epi32(t, shift);
2107         } else {
2108             return _mm512_srli_epi32(q, more);
2109         }
2110     }
2111 }
2112 
2113 __m512i libdivide_u32_branchfree_do_vec512(
2114     __m512i numers, const struct libdivide_u32_branchfree_t *denom) {
2115     __m512i q = libdivide_mullhi_u32_vec512(numers, _mm512_set1_epi32(denom->magic));
2116     __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q);
2117     return _mm512_srli_epi32(t, denom->more);
2118 }
2119 
2120 ////////// UINT64
2121 
2122 __m512i libdivide_u64_do_vec512(__m512i numers, const struct libdivide_u64_t *denom) {
2123     uint8_t more = denom->more;
2124     if (!denom->magic) {
2125         return _mm512_srli_epi64(numers, more);
2126     } else {
2127         __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
2128         if (more & LIBDIVIDE_ADD_MARKER) {
2129             // uint64_t t = ((numer - q) >> 1) + q;
2130             // return t >> denom->shift;
2131             uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2132             __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
2133             return _mm512_srli_epi64(t, shift);
2134         } else {
2135             return _mm512_srli_epi64(q, more);
2136         }
2137     }
2138 }
2139 
2140 __m512i libdivide_u64_branchfree_do_vec512(
2141     __m512i numers, const struct libdivide_u64_branchfree_t *denom) {
2142     __m512i q = libdivide_mullhi_u64_vec512(numers, _mm512_set1_epi64(denom->magic));
2143     __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q);
2144     return _mm512_srli_epi64(t, denom->more);
2145 }
2146 
2147 ////////// SINT16
2148 
2149 __m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom) {
2150     SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)
2151 }
2152 
2153 __m512i libdivide_s16_branchfree_do_vec512(__m512i numers, const struct libdivide_s16_branchfree_t *denom) {
2154     SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)
2155 }
2156 
2157 ////////// SINT32
2158 
2159 __m512i libdivide_s32_do_vec512(__m512i numers, const struct libdivide_s32_t *denom) {
2160     uint8_t more = denom->more;
2161     if (!denom->magic) {
2162         uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2163         uint32_t mask = ((uint32_t)1 << shift) - 1;
2164         __m512i roundToZeroTweak = _mm512_set1_epi32(mask);
2165         // q = numer + ((numer >> 31) & roundToZeroTweak);
2166         __m512i q = _mm512_add_epi32(
2167             numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak));
2168         q = _mm512_srai_epi32(q, shift);
2169         __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2170         // q = (q ^ sign) - sign;
2171         q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);
2172         return q;
2173     } else {
2174         __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(denom->magic));
2175         if (more & LIBDIVIDE_ADD_MARKER) {
2176             // must be arithmetic shift
2177             __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2178             // q += ((numer ^ sign) - sign);
2179             q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign));
2180         }
2181         // q >>= shift
2182         q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
2183         q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31));  // q += (q < 0)
2184         return q;
2185     }
2186 }
2187 
2188 __m512i libdivide_s32_branchfree_do_vec512(
2189     __m512i numers, const struct libdivide_s32_branchfree_t *denom) {
2190     int32_t magic = denom->magic;
2191     uint8_t more = denom->more;
2192     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2193     // must be arithmetic shift
2194     __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2195     __m512i q = libdivide_mullhi_s32_vec512(numers, _mm512_set1_epi32(magic));
2196     q = _mm512_add_epi32(q, numers);  // q += numers
2197 
2198     // If q is non-negative, we have nothing to do
2199     // If q is negative, we want to add either (2**shift)-1 if d is
2200     // a power of 2, or (2**shift) if it is not a power of 2
2201     uint32_t is_power_of_2 = (magic == 0);
2202     __m512i q_sign = _mm512_srai_epi32(q, 31);  // q_sign = q >> 31
2203     __m512i mask = _mm512_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
2204     q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
2205     q = _mm512_srai_epi32(q, shift);                          // q >>= shift
2206     q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign
2207     return q;
2208 }
2209 
2210 ////////// SINT64
2211 
2212 __m512i libdivide_s64_do_vec512(__m512i numers, const struct libdivide_s64_t *denom) {
2213     uint8_t more = denom->more;
2214     int64_t magic = denom->magic;
2215     if (magic == 0) {  // shift path
2216         uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2217         uint64_t mask = ((uint64_t)1 << shift) - 1;
2218         __m512i roundToZeroTweak = _mm512_set1_epi64(mask);
2219         // q = numer + ((numer >> 63) & roundToZeroTweak);
2220         __m512i q = _mm512_add_epi64(
2221             numers, _mm512_and_si512(libdivide_s64_signbits_vec512(numers), roundToZeroTweak));
2222         q = libdivide_s64_shift_right_vec512(q, shift);
2223         __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2224         // q = (q ^ sign) - sign;
2225         q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);
2226         return q;
2227     } else {
2228         __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
2229         if (more & LIBDIVIDE_ADD_MARKER) {
2230             // must be arithmetic shift
2231             __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2232             // q += ((numer ^ sign) - sign);
2233             q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign));
2234         }
2235         // q >>= shift
2236         q = libdivide_s64_shift_right_vec512(q, more & LIBDIVIDE_64_SHIFT_MASK);
2237         q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63));  // q += (q < 0)
2238         return q;
2239     }
2240 }
2241 
2242 __m512i libdivide_s64_branchfree_do_vec512(
2243     __m512i numers, const struct libdivide_s64_branchfree_t *denom) {
2244     int64_t magic = denom->magic;
2245     uint8_t more = denom->more;
2246     uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2247     // must be arithmetic shift
2248     __m512i sign = _mm512_set1_epi32((int8_t)more >> 7);
2249 
2250     // libdivide_mullhi_s64(numers, magic);
2251     __m512i q = libdivide_mullhi_s64_vec512(numers, _mm512_set1_epi64(magic));
2252     q = _mm512_add_epi64(q, numers);  // q += numers
2253 
2254     // If q is non-negative, we have nothing to do.
2255     // If q is negative, we want to add either (2**shift)-1 if d is
2256     // a power of 2, or (2**shift) if it is not a power of 2.
2257     uint32_t is_power_of_2 = (magic == 0);
2258     __m512i q_sign = libdivide_s64_signbits_vec512(q);  // q_sign = q >> 63
2259     __m512i mask = _mm512_set1_epi64(((uint64_t)1 << shift) - is_power_of_2);
2260     q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask));  // q = q + (q_sign & mask)
2261     q = libdivide_s64_shift_right_vec512(q, shift);           // q >>= shift
2262     q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign);    // q = (q ^ sign) - sign
2263     return q;
2264 }
2265 
2266 #endif
2267 
2268 #if defined(LIBDIVIDE_AVX2)
2269 
2270 static LIBDIVIDE_INLINE __m256i libdivide_u16_do_vec256(
2271     __m256i numers, const struct libdivide_u16_t *denom);
2272 static LIBDIVIDE_INLINE __m256i libdivide_s16_do_vec256(
2273     __m256i numers, const struct libdivide_s16_t *denom);
2274 static LIBDIVIDE_INLINE __m256i libdivide_u32_do_vec256(
2275     __m256i numers, const struct libdivide_u32_t *denom);
2276 static LIBDIVIDE_INLINE __m256i libdivide_s32_do_vec256(
2277     __m256i numers, const struct libdivide_s32_t *denom);
2278 static LIBDIVIDE_INLINE __m256i libdivide_u64_do_vec256(
2279     __m256i numers, const struct libdivide_u64_t *denom);
2280 static LIBDIVIDE_INLINE __m256i libdivide_s64_do_vec256(
2281     __m256i numers, const struct libdivide_s64_t *denom);
2282 
2283 static LIBDIVIDE_INLINE __m256i libdivide_u16_branchfree_do_vec256(
2284     __m256i numers, const struct libdivide_u16_branchfree_t *denom);
2285 static LIBDIVIDE_INLINE __m256i libdivide_s16_branchfree_do_vec256(
2286     __m256i numers, const struct libdivide_s16_branchfree_t *denom);
2287 static LIBDIVIDE_INLINE __m256i libdivide_u32_branchfree_do_vec256(
2288     __m256i numers, const struct libdivide_u32_branchfree_t *denom);
2289 static LIBDIVIDE_INLINE __m256i libdivide_s32_branchfree_do_vec256(
2290     __m256i numers, const struct libdivide_s32_branchfree_t *denom);
2291 static LIBDIVIDE_INLINE __m256i libdivide_u64_branchfree_do_vec256(
2292     __m256i numers, const struct libdivide_u64_branchfree_t *denom);
2293 static LIBDIVIDE_INLINE __m256i libdivide_s64_branchfree_do_vec256(
2294     __m256i numers, const struct libdivide_s64_branchfree_t *denom);
2295 
2296 //////// Internal Utility Functions
2297 
2298 // Implementation of _mm256_srai_epi64(v, 63) (from AVX512).
2299 static LIBDIVIDE_INLINE __m256i libdivide_s64_signbits_vec256(__m256i v) {
2300     __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
2301     __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31);
2302     return signBits;
2303 }
2304 
2305 // Implementation of _mm256_srai_epi64 (from AVX512).
2306 static LIBDIVIDE_INLINE __m256i libdivide_s64_shift_right_vec256(__m256i v, int amt) {
2307     const int b = 64 - amt;
2308     __m256i m = _mm256_set1_epi64x((uint64_t)1 << (b - 1));
2309     __m256i x = _mm256_srli_epi64(v, amt);
2310     __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m);
2311     return result;
2312 }
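
// This emulates a 64-bit arithmetic shift using unsigned operations: after the
// logical shift, the original sign bit sits at bit position (63 - amt), and
// (x ^ m) - m with m = 1 << (63 - amt) sign-extends the value from that bit.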
2313 
2314 // Here, b is assumed to contain one 32-bit value repeated.
2315 static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u32_vec256(__m256i a, __m256i b) {
2316     __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32);
2317     __m256i a1X3X = _mm256_srli_epi64(a, 32);
2318     __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
2319     __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask);
2320     return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
2321 }
2322 
2323 // b is one 32-bit value repeated.
2324 static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s32_vec256(__m256i a, __m256i b) {
2325     __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32);
2326     __m256i a1X3X = _mm256_srli_epi64(a, 32);
2327     __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
2328     __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask);
2329     return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3);
2330 }
2331 
2332 // Here, y is assumed to contain one 64-bit value repeated.
2333 static LIBDIVIDE_INLINE __m256i libdivide_mullhi_u64_vec256(__m256i x, __m256i y) {
2334     // see m128i variant for comments.
2335     __m256i x0y0 = _mm256_mul_epu32(x, y);
2336     __m256i x0y0_hi = _mm256_srli_epi64(x0y0, 32);
2337 
2338     __m256i x1 = _mm256_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
2339     __m256i y1 = _mm256_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));
2340 
2341     __m256i x0y1 = _mm256_mul_epu32(x, y1);
2342     __m256i x1y0 = _mm256_mul_epu32(x1, y);
2343     __m256i x1y1 = _mm256_mul_epu32(x1, y1);
2344 
2345     __m256i mask = _mm256_set1_epi64x(0xFFFFFFFF);
2346     __m256i temp = _mm256_add_epi64(x1y0, x0y0_hi);
2347     __m256i temp_lo = _mm256_and_si256(temp, mask);
2348     __m256i temp_hi = _mm256_srli_epi64(temp, 32);
2349 
2350     temp_lo = _mm256_srli_epi64(_mm256_add_epi64(temp_lo, x0y1), 32);
2351     temp_hi = _mm256_add_epi64(x1y1, temp_hi);
2352     return _mm256_add_epi64(temp_lo, temp_hi);
2353 }
2354 
2355 // y is one 64-bit value repeated.
2356 static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y) {
2357     __m256i p = libdivide_mullhi_u64_vec256(x, y);
2358     __m256i t1 = _mm256_and_si256(libdivide_s64_signbits_vec256(x), y);
2359     __m256i t2 = _mm256_and_si256(libdivide_s64_signbits_vec256(y), x);
2360     p = _mm256_sub_epi64(p, t1);
2361     p = _mm256_sub_epi64(p, t2);
2362     return p;
2363 }
2364 
2365 ////////// UINT16
2366 
2367 __m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
2368     SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16)
2369 }
2370 
2371 __m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) {
2372     SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree)
2373 }
2374 
2375 ////////// UINT32
2376 
2377 __m256i libdivide_u32_do_vec256(__m256i numers, const struct libdivide_u32_t *denom) {
2378     uint8_t more = denom->more;
2379     if (!denom->magic) {
2380         return _mm256_srli_epi32(numers, more);
2381     } else {
2382         __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
2383         if (more & LIBDIVIDE_ADD_MARKER) {
2384             // uint32_t t = ((numer - q) >> 1) + q;
2385             // return t >> denom->shift;
2386             uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2387             __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
2388             return _mm256_srli_epi32(t, shift);
2389         } else {
2390             return _mm256_srli_epi32(q, more);
2391         }
2392     }
2393 }
2394 
2395 __m256i libdivide_u32_branchfree_do_vec256(
2396     __m256i numers, const struct libdivide_u32_branchfree_t *denom) {
2397     __m256i q = libdivide_mullhi_u32_vec256(numers, _mm256_set1_epi32(denom->magic));
2398     __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q);
2399     return _mm256_srli_epi32(t, denom->more);
2400 }
2401 
2402 ////////// UINT64
2403 
2404 __m256i libdivide_u64_do_vec256(__m256i numers, const struct libdivide_u64_t *denom) {
2405     uint8_t more = denom->more;
2406     if (!denom->magic) {
2407         return _mm256_srli_epi64(numers, more);
2408     } else {
2409         __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
2410         if (more & LIBDIVIDE_ADD_MARKER) {
2411             // uint64_t t = ((numer - q) >> 1) + q;
2412             // return t >> denom->shift;
2413             uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
2414             __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
2415             return _mm256_srli_epi64(t, shift);
2416         } else {
2417             return _mm256_srli_epi64(q, more);
2418         }
2419     }
2420 }
2421 
2422 __m256i libdivide_u64_branchfree_do_vec256(
2423     __m256i numers, const struct libdivide_u64_branchfree_t *denom) {
2424     __m256i q = libdivide_mullhi_u64_vec256(numers, _mm256_set1_epi64x(denom->magic));
2425     __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q);
2426     return _mm256_srli_epi64(t, denom->more);
2427 }
2428 
2429 ////////// SINT16
2430 
2431 __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
2432     SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16)
2433 }
2434 
2435 __m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) {
2436     SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree)
2437 }
2438 
2439 ////////// SINT32
2440 
2441 __m256i libdivide_s32_do_vec256(__m256i numers, const struct libdivide_s32_t *denom) {
2442     uint8_t more = denom->more;
2443     if (!denom->magic) {
2444         uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2445         uint32_t mask = ((uint32_t)1 << shift) - 1;
2446         __m256i roundToZeroTweak = _mm256_set1_epi32(mask);
2447         // q = numer + ((numer >> 31) & roundToZeroTweak);
2448         __m256i q = _mm256_add_epi32(
2449             numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak));
2450         q = _mm256_srai_epi32(q, shift);
2451         __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
2452         // q = (q ^ sign) - sign;
2453         q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);
2454         return q;
2455     } else {
2456         __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(denom->magic));
2457         if (more & LIBDIVIDE_ADD_MARKER) {
2458             // must be arithmetic shift
2459             __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
2460             // q += ((numer ^ sign) - sign);
2461             q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign));
2462         }
2463         // q >>= shift
2464         q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
2465         q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31));  // q += (q < 0)
2466         return q;
2467     }
2468 }
2469 
libdivide_s32_branchfree_do_vec256(__m256i numers,const struct libdivide_s32_branchfree_t * denom)2470 __m256i libdivide_s32_branchfree_do_vec256(
2471     __m256i numers, const struct libdivide_s32_branchfree_t *denom) {
2472     int32_t magic = denom->magic;
2473     uint8_t more = denom->more;
2474     uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
2475     // must be arithmetic shift
2476     __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
2477     __m256i q = libdivide_mullhi_s32_vec256(numers, _mm256_set1_epi32(magic));
2478     q = _mm256_add_epi32(q, numers);  // q += numers
2479 
2480     // If q is non-negative, we have nothing to do
2481     // If q is negative, we want to add either (2**shift)-1 if d is
2482     // a power of 2, or (2**shift) if it is not a power of 2
2483     uint32_t is_power_of_2 = (magic == 0);
2484     __m256i q_sign = _mm256_srai_epi32(q, 31);  // q_sign = q >> 31
2485     __m256i mask = _mm256_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
2486     q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
2487     q = _mm256_srai_epi32(q, shift);                          // q >>= shift
2488     q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign
2489     return q;
2490 }
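
// Editorial note (illustrative): the power-of-2 case above is the usual
// round-toward-zero fixup. For a negative q, an arithmetic shift alone would
// round toward -infinity; adding (2**shift) - 1 first makes q >> shift behave
// like truncating division by 2**shift. A scalar example with shift = 1:
//
//     int32_t q = -7;
//     int32_t floored = q >> 1;          // -4, rounds toward -infinity
//     int32_t truncated = (q + 1) >> 1;  // -3, matches -7 / 2 in C
//
// When d is not a power of 2, the full 2**shift is added instead, as the
// comment above describes.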

////////// SINT64

__m256i libdivide_s64_do_vec256(__m256i numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) {  // shift path
        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        __m256i roundToZeroTweak = _mm256_set1_epi64x(mask);
        // q = numer + ((numer >> 63) & roundToZeroTweak);
        __m256i q = _mm256_add_epi64(
            numers, _mm256_and_si256(libdivide_s64_signbits_vec256(numers), roundToZeroTweak));
        q = libdivide_s64_shift_right_vec256(q, shift);
        __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);
        return q;
    } else {
        __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign));
        }
        // q >>= shift
        q = libdivide_s64_shift_right_vec256(q, more & LIBDIVIDE_64_SHIFT_MASK);
        q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63));  // q += (q < 0)
        return q;
    }
}

__m256i libdivide_s64_branchfree_do_vec256(
    __m256i numers, const struct libdivide_s64_branchfree_t *denom) {
    int64_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift
    __m256i sign = _mm256_set1_epi32((int8_t)more >> 7);

    // libdivide_mullhi_s64(numers, magic);
    __m256i q = libdivide_mullhi_s64_vec256(numers, _mm256_set1_epi64x(magic));
    q = _mm256_add_epi64(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2.
    uint32_t is_power_of_2 = (magic == 0);
    __m256i q_sign = libdivide_s64_signbits_vec256(q);  // q_sign = q >> 63
    __m256i mask = _mm256_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
    q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
    q = libdivide_s64_shift_right_vec256(q, shift);           // q >>= shift
    q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

#endif

#if defined(LIBDIVIDE_SSE2)

static LIBDIVIDE_INLINE __m128i libdivide_u16_do_vec128(
    __m128i numers, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s16_do_vec128(
    __m128i numers, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u32_do_vec128(
    __m128i numers, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s32_do_vec128(
    __m128i numers, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u64_do_vec128(
    __m128i numers, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s64_do_vec128(
    __m128i numers, const struct libdivide_s64_t *denom);

static LIBDIVIDE_INLINE __m128i libdivide_u16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_u64_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE __m128i libdivide_s64_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s64_branchfree_t *denom);

//////// Internal Utility Functions

// Implementation of _mm_srai_epi64(v, 63) (from AVX512).
static LIBDIVIDE_INLINE __m128i libdivide_s64_signbits_vec128(__m128i v) {
    __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1));
    __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31);
    return signBits;
}

// Implementation of _mm_srai_epi64 (from AVX512).
static LIBDIVIDE_INLINE __m128i libdivide_s64_shift_right_vec128(__m128i v, int amt) {
    const int b = 64 - amt;
    __m128i m = _mm_set1_epi64x((uint64_t)1 << (b - 1));
    __m128i x = _mm_srli_epi64(v, amt);
    __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m);
    return result;
}
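
// Editorial sketch (illustrative): the xor/subtract pair above is a standard
// way to sign-extend after a logical shift. With m holding the bit position
// that the sign occupies after the shift, (x ^ m) - m leaves non-negative
// values unchanged and fills the vacated high bits of negative ones. Scalar
// equivalent for amt = 4:
//
//     int64_t v = -1;
//     uint64_t x = (uint64_t)v >> 4;             // logical shift: 0x0FFF...F
//     uint64_t m = (uint64_t)1 << (64 - 4 - 1);  // the shifted-down sign bit
//     int64_t r = (int64_t)((x ^ m) - m);        // -1, same as v >> 4 arithmetic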

// Here, b is assumed to contain one 32-bit value repeated.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u32_vec128(__m128i a, __m128i b) {
    __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32);
    __m128i a1X3X = _mm_srli_epi64(a, 32);
    __m128i mask = _mm_set_epi32(-1, 0, -1, 0);
    __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask);
    return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3);
}
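
// Editorial note (illustrative): _mm_mul_epu32 only multiplies the even
// 32-bit lanes (0 and 2) of its operands, producing two 64-bit products. The
// routine above therefore runs it twice: once on (a, b), shifting each 64-bit
// product right by 32 so the high halves of lanes 0 and 2 land in the even
// positions, and once on (a >> 32, b), whose products already carry the high
// halves of lanes 1 and 3 in the odd positions (kept by the mask). OR-ing the
// two gives the high half of a * b for all four lanes, relying on b holding
// the same value in every lane.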

// SSE2 does not have a signed multiplication instruction, but we can convert
// unsigned to signed pretty efficiently. Again, b is just a 32-bit value
// repeated four times.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s32_vec128(__m128i a, __m128i b) {
    __m128i p = libdivide_mullhi_u32_vec128(a, b);
    // t1 = (a >> 31) & b, arithmetic shift
    __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b);
    __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a);
    p = _mm_sub_epi32(p, t1);
    p = _mm_sub_epi32(p, t2);
    return p;
}
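
// Editorial note (illustrative): the correction above follows the identity
//
//     mulhi_s32(a, b) == mulhi_u32(a, b) - (a < 0 ? b : 0) - (b < 0 ? a : 0)
//
// Reinterpreting a negative 32-bit a as unsigned adds 2**32 to it, which
// inflates the unsigned high half by exactly b (and symmetrically for b), so
// subtracting those terms recovers the signed result. The arithmetic shifts
// build all-ones masks for the negative lanes, letting the same correction
// run lane-wise without branches.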

// Here, y is assumed to contain one 64-bit value repeated.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_u64_vec128(__m128i x, __m128i y) {
    // The full 128-bit product is:
    // x0*y0 + (x0*y1 << 32) + (x1*y0 << 32) + (x1*y1 << 64)
    // Note x0,y0,x1,y1 are all conceptually uint32, products are 32x32->64.

    // Compute x0*y0.
    // Note x1, y1 are ignored by mul_epu32.
    __m128i x0y0 = _mm_mul_epu32(x, y);
    __m128i x0y0_hi = _mm_srli_epi64(x0y0, 32);

    // Get x1, y1 in the low bits.
    // We could shuffle or right shift. Shuffles are preferred as they preserve
    // the source register for the next computation.
    __m128i x1 = _mm_shuffle_epi32(x, _MM_SHUFFLE(3, 3, 1, 1));
    __m128i y1 = _mm_shuffle_epi32(y, _MM_SHUFFLE(3, 3, 1, 1));

    // No need to mask off top 32 bits for mul_epu32.
    __m128i x0y1 = _mm_mul_epu32(x, y1);
    __m128i x1y0 = _mm_mul_epu32(x1, y);
    __m128i x1y1 = _mm_mul_epu32(x1, y1);

    // Mask here selects low bits only.
    __m128i mask = _mm_set1_epi64x(0xFFFFFFFF);
    __m128i temp = _mm_add_epi64(x1y0, x0y0_hi);
    __m128i temp_lo = _mm_and_si128(temp, mask);
    __m128i temp_hi = _mm_srli_epi64(temp, 32);

    temp_lo = _mm_srli_epi64(_mm_add_epi64(temp_lo, x0y1), 32);
    temp_hi = _mm_add_epi64(x1y1, temp_hi);
    return _mm_add_epi64(temp_lo, temp_hi);
}
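
// Editorial sketch (illustrative): the same 64x64 -> high-64 decomposition in
// scalar form, with x0/x1 and y0/y1 the low/high 32-bit halves, mirroring the
// lane arithmetic above:
//
//     uint64_t x0 = (uint32_t)x, x1 = x >> 32;
//     uint64_t y0 = (uint32_t)y, y1 = y >> 32;
//     uint64_t x0y0_hi = (x0 * y0) >> 32;
//     uint64_t temp = x1 * y0 + x0y0_hi;  // cannot overflow 64 bits
//     uint64_t temp_lo = (uint32_t)temp;
//     uint64_t temp_hi = temp >> 32;
//     uint64_t hi = x1 * y1 + temp_hi + ((temp_lo + x0 * y1) >> 32);
//
// The vector code performs exactly these steps: split temp into low/high
// halves, fold the x0*y1 carry into the low half, then add x1*y1.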

// y is one 64-bit value repeated.
static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y) {
    __m128i p = libdivide_mullhi_u64_vec128(x, y);
    __m128i t1 = _mm_and_si128(libdivide_s64_signbits_vec128(x), y);
    __m128i t2 = _mm_and_si128(libdivide_s64_signbits_vec128(y), x);
    p = _mm_sub_epi64(p, t1);
    p = _mm_sub_epi64(p, t2);
    return p;
}

////////// UINT16

__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
    SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16)
}

__m128i libdivide_u16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u16_branchfree_t *denom) {
    SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree)
}

////////// UINT32

__m128i libdivide_u32_do_vec128(__m128i numers, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return _mm_srli_epi32(numers, more);
    } else {
        __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint32_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
            __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
            return _mm_srli_epi32(t, shift);
        } else {
            return _mm_srli_epi32(q, more);
        }
    }
}

__m128i libdivide_u32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u32_branchfree_t *denom) {
    __m128i q = libdivide_mullhi_u32_vec128(numers, _mm_set1_epi32(denom->magic));
    __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q);
    return _mm_srli_epi32(t, denom->more);
}

////////// UINT64

__m128i libdivide_u64_do_vec128(__m128i numers, const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return _mm_srli_epi64(numers, more);
    } else {
        __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // uint64_t t = ((numer - q) >> 1) + q;
            // return t >> denom->shift;
            uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
            __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
            return _mm_srli_epi64(t, shift);
        } else {
            return _mm_srli_epi64(q, more);
        }
    }
}

__m128i libdivide_u64_branchfree_do_vec128(
    __m128i numers, const struct libdivide_u64_branchfree_t *denom) {
    __m128i q = libdivide_mullhi_u64_vec128(numers, _mm_set1_epi64x(denom->magic));
    __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q);
    return _mm_srli_epi64(t, denom->more);
}

////////// SINT16

__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
    SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16)
}

__m128i libdivide_s16_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s16_branchfree_t *denom) {
    SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree)
}

////////// SINT32

__m128i libdivide_s32_do_vec128(__m128i numers, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
        uint32_t mask = ((uint32_t)1 << shift) - 1;
        __m128i roundToZeroTweak = _mm_set1_epi32(mask);
        // q = numer + ((numer >> 31) & roundToZeroTweak);
        __m128i q =
            _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak));
        q = _mm_srai_epi32(q, shift);
        __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);
        return q;
    } else {
        __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(denom->magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign));
        }
        // q >>= shift
        q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK);
        q = _mm_add_epi32(q, _mm_srli_epi32(q, 31));  // q += (q < 0)
        return q;
    }
}

__m128i libdivide_s32_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s32_branchfree_t *denom) {
    int32_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
    // must be arithmetic shift
    __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
    __m128i q = libdivide_mullhi_s32_vec128(numers, _mm_set1_epi32(magic));
    q = _mm_add_epi32(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2
    uint32_t is_power_of_2 = (magic == 0);
    __m128i q_sign = _mm_srai_epi32(q, 31);  // q_sign = q >> 31
    __m128i mask = _mm_set1_epi32(((uint32_t)1 << shift) - is_power_of_2);
    q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
    q = _mm_srai_epi32(q, shift);                       // q >>= shift
    q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

////////// SINT64

__m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    int64_t magic = denom->magic;
    if (magic == 0) {  // shift path
        uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
        uint64_t mask = ((uint64_t)1 << shift) - 1;
        __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
        // q = numer + ((numer >> 63) & roundToZeroTweak);
        __m128i q = _mm_add_epi64(
            numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
        q = libdivide_s64_shift_right_vec128(q, shift);
        __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
        // q = (q ^ sign) - sign;
        q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);
        return q;
    } else {
        __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift
            __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
            // q += ((numer ^ sign) - sign);
            q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign));
        }
        // q >>= shift
        q = libdivide_s64_shift_right_vec128(q, more & LIBDIVIDE_64_SHIFT_MASK);
        q = _mm_add_epi64(q, _mm_srli_epi64(q, 63));  // q += (q < 0)
        return q;
    }
}

__m128i libdivide_s64_branchfree_do_vec128(
    __m128i numers, const struct libdivide_s64_branchfree_t *denom) {
    int64_t magic = denom->magic;
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift
    __m128i sign = _mm_set1_epi32((int8_t)more >> 7);

    // libdivide_mullhi_s64(numers, magic);
    __m128i q = libdivide_mullhi_s64_vec128(numers, _mm_set1_epi64x(magic));
    q = _mm_add_epi64(q, numers);  // q += numers

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is
    // a power of 2, or (2**shift) if it is not a power of 2.
    uint32_t is_power_of_2 = (magic == 0);
    __m128i q_sign = libdivide_s64_signbits_vec128(q);  // q_sign = q >> 63
    __m128i mask = _mm_set1_epi64x(((uint64_t)1 << shift) - is_power_of_2);
    q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
    q = libdivide_s64_shift_right_vec128(q, shift);     // q >>= shift
    q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign);    // q = (q ^ sign) - sign
    return q;
}

#endif

/////////// C++ stuff

#ifdef __cplusplus

enum Branching {
    BRANCHFULL,  // use branching algorithms
    BRANCHFREE   // use branchfree algorithms
};

#if defined(LIBDIVIDE_NEON)
// Helper to deduce NEON vector type for integral type.
template <typename T>
struct NeonVecFor {};

template <>
struct NeonVecFor<uint16_t> {
    typedef uint16x8_t type;
};

template <>
struct NeonVecFor<int16_t> {
    typedef int16x8_t type;
};

template <>
struct NeonVecFor<uint32_t> {
    typedef uint32x4_t type;
};

template <>
struct NeonVecFor<int32_t> {
    typedef int32x4_t type;
};

template <>
struct NeonVecFor<uint64_t> {
    typedef uint64x2_t type;
};

template <>
struct NeonVecFor<int64_t> {
    typedef int64x2_t type;
};
#endif
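
// Editorial note (illustrative): NeonVecFor lets the templated C++ API below
// name the matching NEON register type generically, e.g.
//
//     typename NeonVecFor<uint32_t>::type v;  // uint32x4_t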

// Versions of our algorithms for SIMD.
#if defined(LIBDIVIDE_NEON)
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)                    \
    LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
        typename NeonVecFor<INT_TYPE>::type n) const {           \
        return libdivide_##ALGO##_do_vec128(n, &denom);          \
    }
#else
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
#endif
#if defined(LIBDIVIDE_SSE2)
#define LIBDIVIDE_DIVIDE_SSE2(ALGO)                     \
    LIBDIVIDE_INLINE __m128i divide(__m128i n) const {  \
        return libdivide_##ALGO##_do_vec128(n, &denom); \
    }
#else
#define LIBDIVIDE_DIVIDE_SSE2(ALGO)
#endif

#if defined(LIBDIVIDE_AVX2)
#define LIBDIVIDE_DIVIDE_AVX2(ALGO)                     \
    LIBDIVIDE_INLINE __m256i divide(__m256i n) const {  \
        return libdivide_##ALGO##_do_vec256(n, &denom); \
    }
#else
#define LIBDIVIDE_DIVIDE_AVX2(ALGO)
#endif

#if defined(LIBDIVIDE_AVX512)
#define LIBDIVIDE_DIVIDE_AVX512(ALGO)                   \
    LIBDIVIDE_INLINE __m512i divide(__m512i n) const {  \
        return libdivide_##ALGO##_do_vec512(n, &denom); \
    }
#else
#define LIBDIVIDE_DIVIDE_AVX512(ALGO)
#endif

// The DISPATCHER_GEN() macro generates C++ methods (for the given integer
// and algorithm types) that redirect to libdivide's C API.
#define DISPATCHER_GEN(T, ALGO)                                                       \
    libdivide_##ALGO##_t denom;                                                       \
    LIBDIVIDE_INLINE dispatcher() {}                                                  \
    LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {}            \
    LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
    LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
    LIBDIVIDE_DIVIDE_NEON(ALGO, T)                                                    \
    LIBDIVIDE_DIVIDE_SSE2(ALGO)                                                       \
    LIBDIVIDE_DIVIDE_AVX2(ALGO)                                                       \
    LIBDIVIDE_DIVIDE_AVX512(ALGO)
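
// Editorial note (illustrative): for T = uint32_t and ALGO = u32, the macro
// above expands (ignoring the SIMD overloads) to roughly:
//
//     libdivide_u32_t denom;
//     LIBDIVIDE_INLINE dispatcher() {}
//     LIBDIVIDE_INLINE dispatcher(uint32_t d) : denom(libdivide_u32_gen(d)) {}
//     LIBDIVIDE_INLINE uint32_t divide(uint32_t n) const { return libdivide_u32_do(n, &denom); }
//     LIBDIVIDE_INLINE uint32_t recover() const { return libdivide_u32_recover(&denom); }
//
// so each dispatcher specialization below is a thin, inlined shim over the
// corresponding C functions.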

// The dispatcher selects a specific division algorithm for a given
// type and ALGO using partial template specialization.
template <typename _IntT, Branching ALGO>
struct dispatcher {};

template <>
struct dispatcher<int16_t, BRANCHFULL> {
    DISPATCHER_GEN(int16_t, s16)
};
template <>
struct dispatcher<int16_t, BRANCHFREE> {
    DISPATCHER_GEN(int16_t, s16_branchfree)
};
template <>
struct dispatcher<uint16_t, BRANCHFULL> {
    DISPATCHER_GEN(uint16_t, u16)
};
template <>
struct dispatcher<uint16_t, BRANCHFREE> {
    DISPATCHER_GEN(uint16_t, u16_branchfree)
};
template <>
struct dispatcher<int32_t, BRANCHFULL> {
    DISPATCHER_GEN(int32_t, s32)
};
template <>
struct dispatcher<int32_t, BRANCHFREE> {
    DISPATCHER_GEN(int32_t, s32_branchfree)
};
template <>
struct dispatcher<uint32_t, BRANCHFULL> {
    DISPATCHER_GEN(uint32_t, u32)
};
template <>
struct dispatcher<uint32_t, BRANCHFREE> {
    DISPATCHER_GEN(uint32_t, u32_branchfree)
};
template <>
struct dispatcher<int64_t, BRANCHFULL> {
    DISPATCHER_GEN(int64_t, s64)
};
template <>
struct dispatcher<int64_t, BRANCHFREE> {
    DISPATCHER_GEN(int64_t, s64_branchfree)
};
template <>
struct dispatcher<uint64_t, BRANCHFULL> {
    DISPATCHER_GEN(uint64_t, u64)
};
template <>
struct dispatcher<uint64_t, BRANCHFREE> {
    DISPATCHER_GEN(uint64_t, u64_branchfree)
};

// This is the main divider class for use by the user (C++ API).
// The actual division algorithm is selected using the dispatcher struct
// based on the integer and algorithm template parameters.
template <typename T, Branching ALGO = BRANCHFULL>
class divider {
   private:
    typedef dispatcher<T, ALGO> dispatcher_t;

   public:
    // We leave the default constructor empty so that creating
    // an array of dividers and then initializing them
    // later doesn't slow us down.
    divider() {}

    // Constructor that takes the divisor as a parameter
    LIBDIVIDE_INLINE divider(T d) : div(d) {}

    // Divides n by the divisor
    LIBDIVIDE_INLINE T divide(T n) const { return div.divide(n); }

    // Recovers the divisor, returns the value that was
    // used to initialize this divider object.
    T recover() const { return div.recover(); }

    bool operator==(const divider<T, ALGO> &other) const {
        return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
    }

    bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }

    // Vector variants treat the input as packed integer values with the same type as the divider
    // (e.g. s32, u32, s64, u64) and divide each of them by the divider, returning the packed
    // quotients.
#if defined(LIBDIVIDE_SSE2)
    LIBDIVIDE_INLINE __m128i divide(__m128i n) const { return div.divide(n); }
#endif
#if defined(LIBDIVIDE_AVX2)
    LIBDIVIDE_INLINE __m256i divide(__m256i n) const { return div.divide(n); }
#endif
#if defined(LIBDIVIDE_AVX512)
    LIBDIVIDE_INLINE __m512i divide(__m512i n) const { return div.divide(n); }
#endif
#if defined(LIBDIVIDE_NEON)
    LIBDIVIDE_INLINE typename NeonVecFor<T>::type divide(typename NeonVecFor<T>::type n) const {
        return div.divide(n);
    }
#endif

   private:
    // Storage for the actual divisor
    dispatcher_t div;
};
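
// Usage sketch (illustrative, not part of the header): dividing many values
// by the same runtime divisor with the C++ API. `data` and `count` stand for
// a caller-provided int64_t buffer and its length; the divisor must not be 0.
//
//     libdivide::divider<int64_t> fast_d(1234);  // precompute once
//     for (size_t i = 0; i < count; ++i) {
//         data[i] = data[i] / fast_d;  // uses operator/ defined below
//     }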

// Overload of operator / for scalar division
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE T operator/(T n, const divider<T, ALGO> &div) {
    return div.divide(n);
}

// Overload of operator /= for scalar division
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE T &operator/=(T &n, const divider<T, ALGO> &div) {
    n = div.divide(n);
    return n;
}

// Overloads for vector types.
#if defined(LIBDIVIDE_SSE2)
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m128i operator/(__m128i n, const divider<T, ALGO> &div) {
    return div.divide(n);
}

template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m128i operator/=(__m128i &n, const divider<T, ALGO> &div) {
    n = div.divide(n);
    return n;
}
#endif
#if defined(LIBDIVIDE_AVX2)
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m256i operator/(__m256i n, const divider<T, ALGO> &div) {
    return div.divide(n);
}

template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m256i operator/=(__m256i &n, const divider<T, ALGO> &div) {
    n = div.divide(n);
    return n;
}
#endif
#if defined(LIBDIVIDE_AVX512)
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m512i operator/(__m512i n, const divider<T, ALGO> &div) {
    return div.divide(n);
}

template <typename T, Branching ALGO>
LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {
    n = div.divide(n);
    return n;
}
#endif

#if defined(LIBDIVIDE_NEON)
template <typename T, Branching ALGO>
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
    typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
    return div.divide(n);
}

template <typename T, Branching ALGO>
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
    typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
    n = div.divide(n);
    return n;
}
#endif
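
// Usage sketch (illustrative): the vector overloads keep the familiar
// division syntax in SIMD loops. Assuming an SSE2 build (LIBDIVIDE_SSE2) and
// a caller-provided uint32_t buffer `data` whose length is a multiple of 4:
//
//     libdivide::divider<uint32_t> fast_d(7);
//     for (size_t i = 0; i < count; i += 4) {
//         __m128i n = _mm_loadu_si128((const __m128i *)&data[i]);
//         n = n / fast_d;  // four divisions at once
//         _mm_storeu_si128((__m128i *)&data[i], n);
//     }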

#if __cplusplus >= 201103L || (defined(_MSC_VER) && _MSC_VER >= 1900)
// libdivide::branchfree_divider<T>
template <typename T>
using branchfree_divider = divider<T, BRANCHFREE>;
#endif
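
// Usage sketch (illustrative): branchfree_divider<T> is simply shorthand for
// divider<T, BRANCHFREE>, which trades the branching fast paths for a fixed
// instruction sequence.
//
//     libdivide::branchfree_divider<uint32_t> d(7);
//     uint32_t q = 1000 / d;  // q == 142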

}  // namespace libdivide

#endif  // __cplusplus

#if defined(_MSC_VER)
#pragma warning(pop)
#endif

#endif  // LIBDIVIDE_H