1 /*
2  * Copyright © 2018 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  */
23 
24 #ifndef FAST_IDIV_BY_CONST_H
25 #define FAST_IDIV_BY_CONST_H
26 
27 /* Imported from:
28  *   https://raw.githubusercontent.com/ridiculousfish/libdivide/master/divide_by_constants_codegen_reference.c
29  */
30 
31 #include <inttypes.h>
32 #include <limits.h>
33 #include <assert.h>
34 
35 #ifdef __cplusplus
36 extern "C" {
37 #endif
38 
39 /* Computes "magic info" for performing signed division by a fixed integer D.
40  * The type 'sint_t' is assumed to be defined as a signed integer type large
41  * enough to hold both the dividend and the divisor.
42  * Here >> is arithmetic (signed) shift, and >>> is logical shift.
43  *
44  * To emit code for n/d, rounding towards zero, use the following sequence:
45  *
46  *   m = compute_signed_magic_info(D)
47  *   emit("result = (m.multiplier * n) >> SINT_BITS");
48  *   if d > 0 and m.multiplier < 0: emit("result += n")
49  *   if d < 0 and m.multiplier > 0: emit("result -= n")
50  *   if m.post_shift > 0: emit("result >>= m.shift")
51  *   emit("result += (result < 0)")
52  *
53  * The shifts by SINT_BITS may be "free" if the high half of the full multiply
54  * is put in a separate register.
55  *
56  * The final add can of course be implemented via the sign bit, e.g.
57  *    result += (result >>> (SINT_BITS - 1))
58  * or
59  *    result -= (result >> (SINT_BITS - 1))
60  *
61  * This code is heavily indebted to Hacker's Delight by Henry Warren.
62  * See http://www.hackersdelight.org/HDcode/magic.c.txt
63  * Used with permission from http://www.hackersdelight.org/permissions.htm
64  */
65 
66 struct util_fast_sdiv_info {
67    int64_t multiplier; /* the "magic number" multiplier */
68    unsigned shift; /* shift for the dividend after multiplying */
69 };
70 
71 struct util_fast_sdiv_info
72 util_compute_fast_sdiv_info(int64_t D, unsigned SINT_BITS);
73 
74 /* Computes "magic info" for performing unsigned division by a fixed positive
75  * integer D.  UINT_BITS is the bit size at which the final "magic"
76  * calculation will be performed; it is assumed to be large enough to hold
77  * both the dividand and the divisor.  num_bits can be set appropriately if n
78  * is known to be smaller than calc_bits; if this is not known then UINT_BITS
79  * for num_bits.
80  *
81  * Assume we have a hardware register of width UINT_BITS, a known constant D
82  * which is not zero and not a power of 2, and a variable n of width num_bits
83  * (which may be up to UINT_BITS). To emit code for n/d, use one of the two
84  * following sequences (here >>> refers to a logical bitshift):
85  *
86  *   m = compute_unsigned_magic_info(D, num_bits)
87  *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
88  *   if m.increment: emit("n = saturated_increment(n)")
89  *   emit("result = (m.multiplier * n) >>> UINT_BITS")
90  *   if m.post_shift > 0: emit("result >>>= m.post_shift")
91  *
92  * or
93  *
94  *   m = compute_unsigned_magic_info(D, num_bits)
95  *   if m.pre_shift > 0: emit("n >>>= m.pre_shift")
96  *   emit("result = m.multiplier * n")
97  *   if m.increment: emit("result = result + m.multiplier")
98  *   emit("result >>>= UINT_BITS")
99  *   if m.post_shift > 0: emit("result >>>= m.post_shift")
100  *
101  * This second version works even if D is 1.  The shifts by UINT_BITS may be
102  * "free" if the high half of the full multiply is put in a separate register.
103  *
104  * saturated_increment(n) means "increment n unless it would wrap to 0," i.e.
105  *   if n == (1 << UINT_BITS)-1: result = n
106  *   else: result = n+1
107  * A common way to implement this is with the carry bit. For example, on x86:
108  *   add 1
109  *   sbb 0
110  *
111  * Some invariants:
112  *   1: At least one of pre_shift and increment is zero
113  *   2: multiplier is never zero
114  *
115  * This code incorporates the "round down" optimization per ridiculous_fish.
116  */
117 
118 struct util_fast_udiv_info {
119    uint64_t multiplier; /* the "magic number" multiplier */
120    unsigned pre_shift; /* shift for the dividend before multiplying */
121    unsigned post_shift; /* shift for the dividend after multiplying */
122    int increment; /* 0 or 1; if set then increment the numerator, using one of
123                      the two strategies */
124 };
125 
126 struct util_fast_udiv_info
127 util_compute_fast_udiv_info(uint64_t D, unsigned num_bits, unsigned UINT_BITS);
128 
129 /* Below are possible options for dividing by a uniform in a shader where
130  * the divisor is constant but not known at compile time.
131  */
132 
133 /* Full version. */
134 static inline uint32_t
util_fast_udiv32(uint32_t n,struct util_fast_udiv_info info)135 util_fast_udiv32(uint32_t n, struct util_fast_udiv_info info)
136 {
137    n = n >> info.pre_shift;
138    /* If the divisor is not 1, you can instead use a 32-bit ADD that clamps
139     * to UINT_MAX. Dividing by 1 needs the full 64-bit ADD.
140     *
141     * If you have unsigned 64-bit MAD with 32-bit inputs, you can do:
142     *    increment = increment ? multiplier : 0; // on the CPU
143     *    (n * multiplier + increment) // on the GPU using unsigned 64-bit MAD
144     */
145    n = (((uint64_t)n + info.increment) * info.multiplier) >> 32;
146    n = n >> info.post_shift;
147    return n;
148 }
149 
150 /* A little more efficient version if n != UINT_MAX, i.e. no unsigned
151  * wraparound in the computation.
152  */
153 static inline uint32_t
util_fast_udiv32_nuw(uint32_t n,struct util_fast_udiv_info info)154 util_fast_udiv32_nuw(uint32_t n, struct util_fast_udiv_info info)
155 {
156    assert(n != UINT32_MAX);
157    n = n >> info.pre_shift;
158    n = n + info.increment;
159    n = ((uint64_t)n * info.multiplier) >> 32;
160    n = n >> info.post_shift;
161    return n;
162 }
163 
164 /* Even faster version but both operands must be 31-bit unsigned integers
165  * and the divisor must be greater than 1.
166  *
167  * info must be computed with num_bits == 31.
168  */
169 static inline uint32_t
util_fast_udiv32_u31_d_not_one(uint32_t n,struct util_fast_udiv_info info)170 util_fast_udiv32_u31_d_not_one(uint32_t n, struct util_fast_udiv_info info)
171 {
172    assert(info.pre_shift == 0);
173    assert(info.increment == 0);
174    n = ((uint64_t)n * info.multiplier) >> 32;
175    n = n >> info.post_shift;
176    return n;
177 }
178 
179 #ifdef __cplusplus
180 } /* extern C */
181 #endif
182 
183 #endif /* FAST_IDIV_BY_CONST_H */
184