/*
 * Copyright (c) 2015-2017, Intel Corporation
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 *  * Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 *  * Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *  * Neither the name of Intel Corporation nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
 * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/** \file
 * \brief Bit-twiddling primitives (ctz, compress etc)
 */

#ifndef BITUTILS_H
#define BITUTILS_H

#include "ue2common.h"
#include "popcount.h"
#include "util/arch.h"
#include "util/intrinsics.h"

#define CASE_BIT          0x20
#define CASE_CLEAR        0xdf
#define DOUBLE_CASE_CLEAR 0xdfdf
#define OCTO_CASE_CLEAR   0xdfdfdfdfdfdfdfdfULL
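/*
 * These masks rely on the ASCII layout: upper- and lower-case letters differ
 * only in bit 0x20, so OR-ing in CASE_BIT lower-cases a letter and AND-ing
 * with CASE_CLEAR upper-cases it, e.g. 'a' (0x61) & 0xdf == 'A' (0x41). The
 * DOUBLE/OCTO variants apply the clear mask across two or eight packed bytes.
 */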

static really_inline
u32 clz32(u32 x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN32)
    unsigned long r;
    _BitScanReverse(&r, x);
    return 31 - r;
#else
    return (u32)__builtin_clz(x);
#endif
}

static really_inline
u32 clz64(u64a x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
    unsigned long r;
    _BitScanReverse64(&r, x);
    return 63 - r;
#elif defined(_WIN32)
    unsigned long x1 = (u32)x;
    unsigned long x2 = (u32)(x >> 32);
    unsigned long r;
    if (x2) {
        _BitScanReverse(&r, x2);
        return (u32)(31 - r);
    }
    _BitScanReverse(&r, (u32)x1);
    return (u32)(63 - r);
#else
    return (u32)__builtin_clzll(x);
#endif
}

// CTZ (count trailing zero) implementations.
static really_inline
u32 ctz32(u32 x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN32)
    unsigned long r;
    _BitScanForward(&r, x);
    return r;
#else
    return (u32)__builtin_ctz(x);
#endif
}

static really_inline
u32 ctz64(u64a x) {
    assert(x); // behaviour not defined for x == 0
#if defined(_WIN64)
    unsigned long r;
    _BitScanForward64(&r, x);
    return r;
#elif defined(_WIN32)
    unsigned long r;
    if (_BitScanForward(&r, (u32)x)) {
        return (u32)r;
    }
    _BitScanForward(&r, (u32)(x >> 32));
    return (u32)(r + 32);
#else
    return (u32)__builtin_ctzll(x);
#endif
}
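
/*
 * Illustrative values for the functions above: clz32(1) == 31,
 * clz32(0x80000000) == 0, ctz32(0x80000000) == 31 and ctz32(0x14) == 2.
 * All four functions require a non-zero argument.
 */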

// Returns the floor of log2(x); defined to return 0 for x == 0.
static really_inline
u32 lg2(u32 x) {
    if (!x) {
        return 0;
    }
    return 31 - clz32(x);
}

// 64-bit variant of lg2; also returns 0 for x == 0.
static really_inline
u64a lg2_64(u64a x) {
    if (!x) {
        return 0;
    }
    return 63 - clz64(x);
}

static really_inline
u32 findAndClearLSB_32(u32 *v) {
    assert(*v != 0); // behaviour not defined in this case
#ifndef NO_ASM
    u32 val = *v, offset;
    __asm__ ("bsf %1, %0\n"
             "btr %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    u32 val = *v;
    u32 offset = ctz32(val);
    *v = val & (val - 1);
#endif

    assert(offset < 32);
    return offset;
}
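
/*
 * Illustrative example: for v == 0x14 (binary 10100), findAndClearLSB_32(&v)
 * returns 2 and leaves v == 0x10.
 */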

static really_inline
u32 findAndClearLSB_64(u64a *v) {
    assert(*v != 0); // behaviour not defined in this case

#ifdef ARCH_64_BIT
#if defined(ARCH_X86_64) && !defined(NO_ASM)
    u64a val = *v, offset;
    __asm__ ("bsfq %1, %0\n"
             "btrq %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = ctz64(val);
    *v = val & (val - 1);
#endif // ARCH_X86_64
#else
    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
    // inline calls to __builtin_ctzll
    u32 v1 = (u32)*v;
    u32 v2 = (u32)(*v >> 32);
    u32 offset;
    if (v1) {
        offset = findAndClearLSB_32(&v1);
        *v = (u64a)v1 | ((u64a)v2 << 32);
    } else {
        offset = findAndClearLSB_32(&v2) + 32;
        *v = (u64a)v2 << 32;
    }
#endif

    assert(offset < 64);
    return (u32)offset;
}

static really_inline
u32 findAndClearMSB_32(u32 *v) {
    assert(*v != 0); // behaviour not defined in this case
#ifndef NO_ASM
    u32 val = *v, offset;
    __asm__ ("bsr %1, %0\n"
             "btr %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    u32 val = *v;
    u32 offset = 31 - clz32(val);
    *v = val & ~(1U << offset); // unsigned constant: offset may be 31
#endif
    assert(offset < 32);
    return offset;
}

static really_inline
u32 findAndClearMSB_64(u64a *v) {
    assert(*v != 0); // behaviour not defined in this case

#ifdef ARCH_64_BIT
#if defined(ARCH_X86_64) && !defined(NO_ASM)
    u64a val = *v, offset;
    __asm__ ("bsrq %1, %0\n"
             "btrq %0, %1\n"
             : "=r" (offset), "=r" (val)
             : "1" (val));
    *v = val;
#else
    // generic variant using gcc's builtin on 64-bit
    u64a val = *v, offset;
    offset = 63 - clz64(val);
    *v = val & ~(1ULL << offset);
#endif // ARCH_X86_64
#else
    // fall back to doing things with two 32-bit cases, since gcc-4.1 doesn't
    // inline calls to __builtin_clzll
    u32 v1 = (u32)*v;
    u32 v2 = (u32)(*v >> 32);
    u32 offset;
    if (v2) {
        offset = findAndClearMSB_32(&v2) + 32;
        *v = ((u64a)v2 << 32) | (u64a)v1;
    } else {
        offset = findAndClearMSB_32(&v1);
        *v = (u64a)v1;
    }
#endif

    assert(offset < 64);
    return (u32)offset;
}

static really_inline
u32 compress32(u32 x, u32 m) {
#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u32(x, m);
#else

    // Return zero quickly on trivial cases
    if ((x & m) == 0) {
        return 0;
    }

    u32 mk, mp, mv, t;

    x &= m; // clear irrelevant bits

    mk = ~m << 1; // we will count 0's to right
    for (u32 i = 0; i < 5; i++) {
        mp = mk ^ (mk << 1);
        mp ^= mp << 2;
        mp ^= mp << 4;
        mp ^= mp << 8;
        mp ^= mp << 16;

        mv = mp & m; // bits to move
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        t = x & mv;
        x = (x ^ t) | (t >> (1 << i)); // compress x
        mk = mk & ~mp;
    }

    return x;
#endif
}
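
/*
 * The fallback above is the classic parallel-suffix compress routine (as in
 * Hacker's Delight); e.g. compress32(0xABCD, 0x0F0F) gathers the bits of x
 * selected by m (bits 0..3 and 8..11 here) into the low-order bits of the
 * result, giving 0xBD.
 */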

static really_inline
u64a compress64(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pext_u64(x, m);
#else

    // Return zero quickly on trivial cases
    if ((x & m) == 0) {
        return 0;
    }

    u64a mk, mp, mv, t;

    x &= m; // clear irrelevant bits

    mk = ~m << 1; // we will count 0's to right
    for (u32 i = 0; i < 6; i++) {
        mp = mk ^ (mk << 1);
        mp ^= mp << 2;
        mp ^= mp << 4;
        mp ^= mp << 8;
        mp ^= mp << 16;
        mp ^= mp << 32;

        mv = mp & m; // bits to move
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        t = x & mv;
        x = (x ^ t) | (t >> (1 << i)); // compress x
        mk = mk & ~mp;
    }

    return x;
#endif
}

static really_inline
u32 expand32(u32 x, u32 m) {
#if defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u32(x, m);
#else

    // Return zero quickly on trivial cases
    if (!x || !m) {
        return 0;
    }

    u32 m0, mk, mp, mv, t;
    u32 array[5];

    m0 = m; // save original mask
    mk = ~m << 1; // we will count 0's to right

    for (int i = 0; i < 5; i++) {
        mp = mk ^ (mk << 1); // parallel suffix
        mp = mp ^ (mp << 2);
        mp = mp ^ (mp << 4);
        mp = mp ^ (mp << 8);
        mp = mp ^ (mp << 16);
        mv = mp & m; // bits to move
        array[i] = mv;
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        mk = mk & ~mp;
    }

    for (int i = 4; i >= 0; i--) {
        mv = array[i];
        t = x << (1 << i);
        x = (x & ~mv) | (t & mv);
    }

    return x & m0; // clear out extraneous bits
#endif
}
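
/*
 * expand32 is the inverse scatter operation: e.g. expand32(0xBD, 0x0F0F)
 * deposits the low-order bits of x into the positions selected by m, giving
 * 0x0B0D. In general, expand32(compress32(x, m), m) == (x & m).
 */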

static really_inline
u64a expand64(u64a x, u64a m) {
#if defined(ARCH_X86_64) && defined(HAVE_BMI2)
    // BMI2 has a single instruction for this operation.
    return _pdep_u64(x, m);
#else

    // Return zero quickly on trivial cases
    if (!x || !m) {
        return 0;
    }

    u64a m0, mk, mp, mv, t;
    u64a array[6];

    m0 = m; // save original mask
    mk = ~m << 1; // we will count 0's to right

    for (int i = 0; i < 6; i++) {
        mp = mk ^ (mk << 1); // parallel suffix
        mp = mp ^ (mp << 2);
        mp = mp ^ (mp << 4);
        mp = mp ^ (mp << 8);
        mp = mp ^ (mp << 16);
        mp = mp ^ (mp << 32);
        mv = mp & m; // bits to move
        array[i] = mv;
        m = (m ^ mv) | (mv >> (1 << i)); // compress m
        mk = mk & ~mp;
    }

    for (int i = 5; i >= 0; i--) {
        mv = array[i];
        t = x << (1 << i);
        x = (x & ~mv) | (t & mv);
    }

    return x & m0; // clear out extraneous bits
#endif
}


/* Returns the index of the first set bit strictly after begin, or ~0U if no
 * such bit exists. Pass begin == ~0U to find the first set bit overall.
 */
static really_inline
u32 bf64_iterate(u64a bitfield, u32 begin) {
    if (begin != ~0U) {
        /* switch off all bits at or below begin. Note: it is not legal to
         * shift by the size of the datatype or larger. */
        assert(begin <= 63);
        bitfield &= ~((2ULL << begin) - 1);
    }

    if (!bitfield) {
        return ~0U;
    }

    return ctz64(bitfield);
}
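
/*
 * Typical usage (illustrative; assumes some u64a variable named bits): visit
 * every set bit in ascending order.
 *
 *     for (u32 i = bf64_iterate(bits, ~0U); i != ~0U;
 *          i = bf64_iterate(bits, i)) {
 *         // bit i of bits is set
 *     }
 */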

// Sets bit i of the bitfield; returns non-zero if the bit was already set.
static really_inline
char bf64_set(u64a *bitfield, u32 i) {
    assert(i < 64);
    u64a mask = 1ULL << i;
    char was_set = !!(*bitfield & mask);
    *bitfield |= mask;

    return was_set;
}

// Clears bit i of the bitfield.
static really_inline
void bf64_unset(u64a *bitfield, u32 i) {
    assert(i < 64);
    *bitfield &= ~(1ULL << i);
}

// Returns the rank of the given (set) bit in the mask: the number of set
// bits below it.
static really_inline
u32 rank_in_mask32(u32 mask, u32 bit) {
    assert(bit < sizeof(u32) * 8);
    assert(mask & (u32)(1U << bit));
    mask &= (u32)(1U << bit) - 1;
    return popcount32(mask);
}
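
/*
 * Illustrative example: for mask == 0x16 (bits 1, 2 and 4 set),
 * rank_in_mask32(0x16, 4) == 2, i.e. bit 4 is the set bit of rank 2 (the
 * third lowest). The queried bit must itself be set in the mask.
 */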

static really_inline
u32 rank_in_mask64(u64a mask, u32 bit) {
    assert(bit < sizeof(u64a) * 8);
    assert(mask & (u64a)(1ULL << bit));
    mask &= (u64a)(1ULL << bit) - 1;
    return popcount64(mask);
}

static really_inline
u32 pext32(u32 x, u32 mask) {
#if defined(HAVE_BMI2)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u32(x, mask);
#else

    u32 result = 0, num = 1;
    while (mask != 0) {
        u32 bit = findAndClearLSB_32(&mask);
        if (x & (1U << bit)) {
            assert(num != 0); // more than 32 bits!
            result |= num;
        }
        num <<= 1;
    }
    return result;
#endif
}
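
/*
 * Note that the fallback above computes the same result as compress32, one
 * mask bit at a time: e.g. pext32(0xABCD, 0x0F0F) == 0xBD.
 */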

static really_inline
u64a pext64(u64a x, u64a mask) {
#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
    // Intel BMI2 can do this operation in one instruction.
    return _pext_u64(x, mask);
#else

    // result and num are 64-bit so that masks with more than 32 set bits are
    // handled correctly, matching the BMI2 path.
    u64a result = 0, num = 1;
    while (mask != 0) {
        u32 bit = findAndClearLSB_64(&mask);
        if (x & (1ULL << bit)) {
            assert(num != 0);
            result |= num;
        }
        num <<= 1;
    }
    return result;
#endif
}

#if defined(HAVE_BMI2) && defined(ARCH_64_BIT)
static really_inline
u64a pdep64(u64a x, u64a mask) {
    return _pdep_u64(x, mask);
}
#endif

#endif // BITUTILS_H