1 /* 2 * Utility compute operations used by translated code. 3 * 4 * Copyright (c) 2007 Thiemo Seufer 5 * Copyright (c) 2007 Jocelyn Mayer 6 * 7 * Permission is hereby granted, free of charge, to any person obtaining a copy 8 * of this software and associated documentation files (the "Software"), to deal 9 * in the Software without restriction, including without limitation the rights 10 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 11 * copies of the Software, and to permit persons to whom the Software is 12 * furnished to do so, subject to the following conditions: 13 * 14 * The above copyright notice and this permission notice shall be included in 15 * all copies or substantial portions of the Software. 16 * 17 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 18 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 19 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL 20 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 21 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 22 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN 23 * THE SOFTWARE. 24 */ 25 26 /* Portions of this work are licensed under the terms of the GNU GPL, 27 * version 2 or later. See the COPYING file in the top-level directory. 28 */ 29 30 #ifndef HOST_UTILS_H 31 #define HOST_UTILS_H 32 33 #include "qemu/bswap.h" 34 #include "qemu/int128.h" 35 36 #ifdef CONFIG_INT128 37 static inline void mulu64(uint64_t *plow, uint64_t *phigh, 38 uint64_t a, uint64_t b) 39 { 40 __uint128_t r = (__uint128_t)a * b; 41 *plow = r; 42 *phigh = r >> 64; 43 } 44 45 static inline void muls64(uint64_t *plow, uint64_t *phigh, 46 int64_t a, int64_t b) 47 { 48 __int128_t r = (__int128_t)a * b; 49 *plow = r; 50 *phigh = r >> 64; 51 } 52 53 /* compute with 96 bit intermediate result: (a*b)/c */ 54 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) 55 { 56 return (__int128_t)a * b / c; 57 } 58 59 static inline uint64_t divu128(uint64_t *plow, uint64_t *phigh, 60 uint64_t divisor) 61 { 62 __uint128_t dividend = ((__uint128_t)*phigh << 64) | *plow; 63 __uint128_t result = dividend / divisor; 64 65 *plow = result; 66 *phigh = result >> 64; 67 return dividend % divisor; 68 } 69 70 static inline int64_t divs128(uint64_t *plow, int64_t *phigh, 71 int64_t divisor) 72 { 73 __int128_t dividend = ((__int128_t)*phigh << 64) | *plow; 74 __int128_t result = dividend / divisor; 75 76 *plow = result; 77 *phigh = result >> 64; 78 return dividend % divisor; 79 } 80 #else 81 void muls64(uint64_t *plow, uint64_t *phigh, int64_t a, int64_t b); 82 void mulu64(uint64_t *plow, uint64_t *phigh, uint64_t a, uint64_t b); 83 uint64_t divu128(uint64_t *plow, uint64_t *phigh, uint64_t divisor); 84 int64_t divs128(uint64_t *plow, int64_t *phigh, int64_t divisor); 85 86 static inline uint64_t muldiv64(uint64_t a, uint32_t b, uint32_t c) 87 { 88 union { 89 uint64_t ll; 90 struct { 91 #if HOST_BIG_ENDIAN 92 uint32_t high, low; 93 #else 94 uint32_t low, high; 95 #endif 96 } l; 97 } u, res; 98 uint64_t rl, rh; 99 100 u.ll = a; 101 rl = (uint64_t)u.l.low * (uint64_t)b; 102 rh = (uint64_t)u.l.high * (uint64_t)b; 103 rh += (rl >> 32); 104 res.l.high = rh / c; 105 res.l.low = (((rh % c) << 32) + (rl & 0xffffffff)) / c; 106 return res.ll; 107 } 108 #endif 109 110 /** 111 * clz8 - count leading zeros in a 8-bit value. 112 * @val: The value to search 113 * 114 * Returns 8 if the value is zero. Note that the GCC builtin is 115 * undefined if the value is zero. 116 * 117 * Note that the GCC builtin will upcast its argument to an `unsigned int` 118 * so this function subtracts off the number of prepended zeroes. 119 */ 120 static inline int clz8(uint8_t val) 121 { 122 return val ? __builtin_clz(val) - 24 : 8; 123 } 124 125 /** 126 * clz16 - count leading zeros in a 16-bit value. 127 * @val: The value to search 128 * 129 * Returns 16 if the value is zero. Note that the GCC builtin is 130 * undefined if the value is zero. 131 * 132 * Note that the GCC builtin will upcast its argument to an `unsigned int` 133 * so this function subtracts off the number of prepended zeroes. 134 */ 135 static inline int clz16(uint16_t val) 136 { 137 return val ? __builtin_clz(val) - 16 : 16; 138 } 139 140 /** 141 * clz32 - count leading zeros in a 32-bit value. 142 * @val: The value to search 143 * 144 * Returns 32 if the value is zero. Note that the GCC builtin is 145 * undefined if the value is zero. 146 */ 147 static inline int clz32(uint32_t val) 148 { 149 return val ? __builtin_clz(val) : 32; 150 } 151 152 /** 153 * clo32 - count leading ones in a 32-bit value. 154 * @val: The value to search 155 * 156 * Returns 32 if the value is -1. 157 */ 158 static inline int clo32(uint32_t val) 159 { 160 return clz32(~val); 161 } 162 163 /** 164 * clz64 - count leading zeros in a 64-bit value. 165 * @val: The value to search 166 * 167 * Returns 64 if the value is zero. Note that the GCC builtin is 168 * undefined if the value is zero. 169 */ 170 static inline int clz64(uint64_t val) 171 { 172 return val ? __builtin_clzll(val) : 64; 173 } 174 175 /** 176 * clo64 - count leading ones in a 64-bit value. 177 * @val: The value to search 178 * 179 * Returns 64 if the value is -1. 180 */ 181 static inline int clo64(uint64_t val) 182 { 183 return clz64(~val); 184 } 185 186 /** 187 * ctz8 - count trailing zeros in a 8-bit value. 188 * @val: The value to search 189 * 190 * Returns 8 if the value is zero. Note that the GCC builtin is 191 * undefined if the value is zero. 192 */ 193 static inline int ctz8(uint8_t val) 194 { 195 return val ? __builtin_ctz(val) : 8; 196 } 197 198 /** 199 * ctz16 - count trailing zeros in a 16-bit value. 200 * @val: The value to search 201 * 202 * Returns 16 if the value is zero. Note that the GCC builtin is 203 * undefined if the value is zero. 204 */ 205 static inline int ctz16(uint16_t val) 206 { 207 return val ? __builtin_ctz(val) : 16; 208 } 209 210 /** 211 * ctz32 - count trailing zeros in a 32-bit value. 212 * @val: The value to search 213 * 214 * Returns 32 if the value is zero. Note that the GCC builtin is 215 * undefined if the value is zero. 216 */ 217 static inline int ctz32(uint32_t val) 218 { 219 return val ? __builtin_ctz(val) : 32; 220 } 221 222 /** 223 * cto32 - count trailing ones in a 32-bit value. 224 * @val: The value to search 225 * 226 * Returns 32 if the value is -1. 227 */ 228 static inline int cto32(uint32_t val) 229 { 230 return ctz32(~val); 231 } 232 233 /** 234 * ctz64 - count trailing zeros in a 64-bit value. 235 * @val: The value to search 236 * 237 * Returns 64 if the value is zero. Note that the GCC builtin is 238 * undefined if the value is zero. 239 */ 240 static inline int ctz64(uint64_t val) 241 { 242 return val ? __builtin_ctzll(val) : 64; 243 } 244 245 /** 246 * cto64 - count trailing ones in a 64-bit value. 247 * @val: The value to search 248 * 249 * Returns 64 if the value is -1. 250 */ 251 static inline int cto64(uint64_t val) 252 { 253 return ctz64(~val); 254 } 255 256 /** 257 * clrsb32 - count leading redundant sign bits in a 32-bit value. 258 * @val: The value to search 259 * 260 * Returns the number of bits following the sign bit that are equal to it. 261 * No special cases; output range is [0-31]. 262 */ 263 static inline int clrsb32(uint32_t val) 264 { 265 #if __has_builtin(__builtin_clrsb) || !defined(__clang__) 266 return __builtin_clrsb(val); 267 #else 268 return clz32(val ^ ((int32_t)val >> 1)) - 1; 269 #endif 270 } 271 272 /** 273 * clrsb64 - count leading redundant sign bits in a 64-bit value. 274 * @val: The value to search 275 * 276 * Returns the number of bits following the sign bit that are equal to it. 277 * No special cases; output range is [0-63]. 278 */ 279 static inline int clrsb64(uint64_t val) 280 { 281 #if __has_builtin(__builtin_clrsbll) || !defined(__clang__) 282 return __builtin_clrsbll(val); 283 #else 284 return clz64(val ^ ((int64_t)val >> 1)) - 1; 285 #endif 286 } 287 288 /** 289 * ctpop8 - count the population of one bits in an 8-bit value. 290 * @val: The value to search 291 */ 292 static inline int ctpop8(uint8_t val) 293 { 294 return __builtin_popcount(val); 295 } 296 297 /** 298 * ctpop16 - count the population of one bits in a 16-bit value. 299 * @val: The value to search 300 */ 301 static inline int ctpop16(uint16_t val) 302 { 303 return __builtin_popcount(val); 304 } 305 306 /** 307 * ctpop32 - count the population of one bits in a 32-bit value. 308 * @val: The value to search 309 */ 310 static inline int ctpop32(uint32_t val) 311 { 312 return __builtin_popcount(val); 313 } 314 315 /** 316 * ctpop64 - count the population of one bits in a 64-bit value. 317 * @val: The value to search 318 */ 319 static inline int ctpop64(uint64_t val) 320 { 321 return __builtin_popcountll(val); 322 } 323 324 /** 325 * revbit8 - reverse the bits in an 8-bit value. 326 * @x: The value to modify. 327 */ 328 static inline uint8_t revbit8(uint8_t x) 329 { 330 #if __has_builtin(__builtin_bitreverse8) 331 return __builtin_bitreverse8(x); 332 #else 333 /* Assign the correct nibble position. */ 334 x = ((x & 0xf0) >> 4) 335 | ((x & 0x0f) << 4); 336 /* Assign the correct bit position. */ 337 x = ((x & 0x88) >> 3) 338 | ((x & 0x44) >> 1) 339 | ((x & 0x22) << 1) 340 | ((x & 0x11) << 3); 341 return x; 342 #endif 343 } 344 345 /** 346 * revbit16 - reverse the bits in a 16-bit value. 347 * @x: The value to modify. 348 */ 349 static inline uint16_t revbit16(uint16_t x) 350 { 351 #if __has_builtin(__builtin_bitreverse16) 352 return __builtin_bitreverse16(x); 353 #else 354 /* Assign the correct byte position. */ 355 x = bswap16(x); 356 /* Assign the correct nibble position. */ 357 x = ((x & 0xf0f0) >> 4) 358 | ((x & 0x0f0f) << 4); 359 /* Assign the correct bit position. */ 360 x = ((x & 0x8888) >> 3) 361 | ((x & 0x4444) >> 1) 362 | ((x & 0x2222) << 1) 363 | ((x & 0x1111) << 3); 364 return x; 365 #endif 366 } 367 368 /** 369 * revbit32 - reverse the bits in a 32-bit value. 370 * @x: The value to modify. 371 */ 372 static inline uint32_t revbit32(uint32_t x) 373 { 374 #if __has_builtin(__builtin_bitreverse32) 375 return __builtin_bitreverse32(x); 376 #else 377 /* Assign the correct byte position. */ 378 x = bswap32(x); 379 /* Assign the correct nibble position. */ 380 x = ((x & 0xf0f0f0f0u) >> 4) 381 | ((x & 0x0f0f0f0fu) << 4); 382 /* Assign the correct bit position. */ 383 x = ((x & 0x88888888u) >> 3) 384 | ((x & 0x44444444u) >> 1) 385 | ((x & 0x22222222u) << 1) 386 | ((x & 0x11111111u) << 3); 387 return x; 388 #endif 389 } 390 391 /** 392 * revbit64 - reverse the bits in a 64-bit value. 393 * @x: The value to modify. 394 */ 395 static inline uint64_t revbit64(uint64_t x) 396 { 397 #if __has_builtin(__builtin_bitreverse64) 398 return __builtin_bitreverse64(x); 399 #else 400 /* Assign the correct byte position. */ 401 x = bswap64(x); 402 /* Assign the correct nibble position. */ 403 x = ((x & 0xf0f0f0f0f0f0f0f0ull) >> 4) 404 | ((x & 0x0f0f0f0f0f0f0f0full) << 4); 405 /* Assign the correct bit position. */ 406 x = ((x & 0x8888888888888888ull) >> 3) 407 | ((x & 0x4444444444444444ull) >> 1) 408 | ((x & 0x2222222222222222ull) << 1) 409 | ((x & 0x1111111111111111ull) << 3); 410 return x; 411 #endif 412 } 413 414 /** 415 * Return the absolute value of a 64-bit integer as an unsigned 64-bit value 416 */ 417 static inline uint64_t uabs64(int64_t v) 418 { 419 return v < 0 ? -v : v; 420 } 421 422 /** 423 * sadd32_overflow - addition with overflow indication 424 * @x, @y: addends 425 * @ret: Output for sum 426 * 427 * Computes *@ret = @x + @y, and returns true if and only if that 428 * value has been truncated. 429 */ 430 static inline bool sadd32_overflow(int32_t x, int32_t y, int32_t *ret) 431 { 432 return __builtin_add_overflow(x, y, ret); 433 } 434 435 /** 436 * sadd64_overflow - addition with overflow indication 437 * @x, @y: addends 438 * @ret: Output for sum 439 * 440 * Computes *@ret = @x + @y, and returns true if and only if that 441 * value has been truncated. 442 */ 443 static inline bool sadd64_overflow(int64_t x, int64_t y, int64_t *ret) 444 { 445 return __builtin_add_overflow(x, y, ret); 446 } 447 448 /** 449 * uadd32_overflow - addition with overflow indication 450 * @x, @y: addends 451 * @ret: Output for sum 452 * 453 * Computes *@ret = @x + @y, and returns true if and only if that 454 * value has been truncated. 455 */ 456 static inline bool uadd32_overflow(uint32_t x, uint32_t y, uint32_t *ret) 457 { 458 return __builtin_add_overflow(x, y, ret); 459 } 460 461 /** 462 * uadd64_overflow - addition with overflow indication 463 * @x, @y: addends 464 * @ret: Output for sum 465 * 466 * Computes *@ret = @x + @y, and returns true if and only if that 467 * value has been truncated. 468 */ 469 static inline bool uadd64_overflow(uint64_t x, uint64_t y, uint64_t *ret) 470 { 471 return __builtin_add_overflow(x, y, ret); 472 } 473 474 /** 475 * ssub32_overflow - subtraction with overflow indication 476 * @x: Minuend 477 * @y: Subtrahend 478 * @ret: Output for difference 479 * 480 * Computes *@ret = @x - @y, and returns true if and only if that 481 * value has been truncated. 482 */ 483 static inline bool ssub32_overflow(int32_t x, int32_t y, int32_t *ret) 484 { 485 return __builtin_sub_overflow(x, y, ret); 486 } 487 488 /** 489 * ssub64_overflow - subtraction with overflow indication 490 * @x: Minuend 491 * @y: Subtrahend 492 * @ret: Output for sum 493 * 494 * Computes *@ret = @x - @y, and returns true if and only if that 495 * value has been truncated. 496 */ 497 static inline bool ssub64_overflow(int64_t x, int64_t y, int64_t *ret) 498 { 499 return __builtin_sub_overflow(x, y, ret); 500 } 501 502 /** 503 * usub32_overflow - subtraction with overflow indication 504 * @x: Minuend 505 * @y: Subtrahend 506 * @ret: Output for sum 507 * 508 * Computes *@ret = @x - @y, and returns true if and only if that 509 * value has been truncated. 510 */ 511 static inline bool usub32_overflow(uint32_t x, uint32_t y, uint32_t *ret) 512 { 513 return __builtin_sub_overflow(x, y, ret); 514 } 515 516 /** 517 * usub64_overflow - subtraction with overflow indication 518 * @x: Minuend 519 * @y: Subtrahend 520 * @ret: Output for sum 521 * 522 * Computes *@ret = @x - @y, and returns true if and only if that 523 * value has been truncated. 524 */ 525 static inline bool usub64_overflow(uint64_t x, uint64_t y, uint64_t *ret) 526 { 527 return __builtin_sub_overflow(x, y, ret); 528 } 529 530 /** 531 * smul32_overflow - multiplication with overflow indication 532 * @x, @y: Input multipliers 533 * @ret: Output for product 534 * 535 * Computes *@ret = @x * @y, and returns true if and only if that 536 * value has been truncated. 537 */ 538 static inline bool smul32_overflow(int32_t x, int32_t y, int32_t *ret) 539 { 540 return __builtin_mul_overflow(x, y, ret); 541 } 542 543 /** 544 * smul64_overflow - multiplication with overflow indication 545 * @x, @y: Input multipliers 546 * @ret: Output for product 547 * 548 * Computes *@ret = @x * @y, and returns true if and only if that 549 * value has been truncated. 550 */ 551 static inline bool smul64_overflow(int64_t x, int64_t y, int64_t *ret) 552 { 553 return __builtin_mul_overflow(x, y, ret); 554 } 555 556 /** 557 * umul32_overflow - multiplication with overflow indication 558 * @x, @y: Input multipliers 559 * @ret: Output for product 560 * 561 * Computes *@ret = @x * @y, and returns true if and only if that 562 * value has been truncated. 563 */ 564 static inline bool umul32_overflow(uint32_t x, uint32_t y, uint32_t *ret) 565 { 566 return __builtin_mul_overflow(x, y, ret); 567 } 568 569 /** 570 * umul64_overflow - multiplication with overflow indication 571 * @x, @y: Input multipliers 572 * @ret: Output for product 573 * 574 * Computes *@ret = @x * @y, and returns true if and only if that 575 * value has been truncated. 576 */ 577 static inline bool umul64_overflow(uint64_t x, uint64_t y, uint64_t *ret) 578 { 579 return __builtin_mul_overflow(x, y, ret); 580 } 581 582 /* 583 * Unsigned 128x64 multiplication. 584 * Returns true if the result got truncated to 128 bits. 585 * Otherwise, returns false and the multiplication result via plow and phigh. 586 */ 587 static inline bool mulu128(uint64_t *plow, uint64_t *phigh, uint64_t factor) 588 { 589 #if defined(CONFIG_INT128) 590 bool res; 591 __uint128_t r; 592 __uint128_t f = ((__uint128_t)*phigh << 64) | *plow; 593 res = __builtin_mul_overflow(f, factor, &r); 594 595 *plow = r; 596 *phigh = r >> 64; 597 598 return res; 599 #else 600 uint64_t dhi = *phigh; 601 uint64_t dlo = *plow; 602 uint64_t ahi; 603 uint64_t blo, bhi; 604 605 if (dhi == 0) { 606 mulu64(plow, phigh, dlo, factor); 607 return false; 608 } 609 610 mulu64(plow, &ahi, dlo, factor); 611 mulu64(&blo, &bhi, dhi, factor); 612 613 return uadd64_overflow(ahi, blo, phigh) || bhi != 0; 614 #endif 615 } 616 617 /** 618 * uadd64_carry - addition with carry-in and carry-out 619 * @x, @y: addends 620 * @pcarry: in-out carry value 621 * 622 * Computes @x + @y + *@pcarry, placing the carry-out back 623 * into *@pcarry and returning the 64-bit sum. 624 */ 625 static inline uint64_t uadd64_carry(uint64_t x, uint64_t y, bool *pcarry) 626 { 627 #if __has_builtin(__builtin_addcll) 628 unsigned long long c = *pcarry; 629 x = __builtin_addcll(x, y, c, &c); 630 *pcarry = c & 1; 631 return x; 632 #else 633 bool c = *pcarry; 634 /* This is clang's internal expansion of __builtin_addc. */ 635 c = uadd64_overflow(x, c, &x); 636 c |= uadd64_overflow(x, y, &x); 637 *pcarry = c; 638 return x; 639 #endif 640 } 641 642 /** 643 * usub64_borrow - subtraction with borrow-in and borrow-out 644 * @x, @y: addends 645 * @pborrow: in-out borrow value 646 * 647 * Computes @x - @y - *@pborrow, placing the borrow-out back 648 * into *@pborrow and returning the 64-bit sum. 649 */ 650 static inline uint64_t usub64_borrow(uint64_t x, uint64_t y, bool *pborrow) 651 { 652 #if __has_builtin(__builtin_subcll) 653 unsigned long long b = *pborrow; 654 x = __builtin_subcll(x, y, b, &b); 655 *pborrow = b & 1; 656 return x; 657 #else 658 bool b = *pborrow; 659 b = usub64_overflow(x, b, &x); 660 b |= usub64_overflow(x, y, &x); 661 *pborrow = b; 662 return x; 663 #endif 664 } 665 666 /* Host type specific sizes of these routines. */ 667 668 #if ULONG_MAX == UINT32_MAX 669 # define clzl clz32 670 # define ctzl ctz32 671 # define clol clo32 672 # define ctol cto32 673 # define ctpopl ctpop32 674 # define revbitl revbit32 675 #elif ULONG_MAX == UINT64_MAX 676 # define clzl clz64 677 # define ctzl ctz64 678 # define clol clo64 679 # define ctol cto64 680 # define ctpopl ctpop64 681 # define revbitl revbit64 682 #else 683 # error Unknown sizeof long 684 #endif 685 686 static inline bool is_power_of_2(uint64_t value) 687 { 688 if (!value) { 689 return false; 690 } 691 692 return !(value & (value - 1)); 693 } 694 695 /** 696 * Return @value rounded down to the nearest power of two or zero. 697 */ 698 static inline uint64_t pow2floor(uint64_t value) 699 { 700 if (!value) { 701 /* Avoid undefined shift by 64 */ 702 return 0; 703 } 704 return 0x8000000000000000ull >> clz64(value); 705 } 706 707 /* 708 * Return @value rounded up to the nearest power of two modulo 2^64. 709 * This is *zero* for @value > 2^63, so be careful. 710 */ 711 static inline uint64_t pow2ceil(uint64_t value) 712 { 713 int n = clz64(value - 1); 714 715 if (!n) { 716 /* 717 * @value - 1 has no leading zeroes, thus @value - 1 >= 2^63 718 * Therefore, either @value == 0 or @value > 2^63. 719 * If it's 0, return 1, else return 0. 720 */ 721 return !value; 722 } 723 return 0x8000000000000000ull >> (n - 1); 724 } 725 726 static inline uint32_t pow2roundup32(uint32_t x) 727 { 728 x |= (x >> 1); 729 x |= (x >> 2); 730 x |= (x >> 4); 731 x |= (x >> 8); 732 x |= (x >> 16); 733 return x + 1; 734 } 735 736 /** 737 * urshift - 128-bit Unsigned Right Shift. 738 * @plow: in/out - lower 64-bit integer. 739 * @phigh: in/out - higher 64-bit integer. 740 * @shift: in - bytes to shift, between 0 and 127. 741 * 742 * Result is zero-extended and stored in plow/phigh, which are 743 * input/output variables. Shift values outside the range will 744 * be mod to 128. In other words, the caller is responsible to 745 * verify/assert both the shift range and plow/phigh pointers. 746 */ 747 void urshift(uint64_t *plow, uint64_t *phigh, int32_t shift); 748 749 /** 750 * ulshift - 128-bit Unsigned Left Shift. 751 * @plow: in/out - lower 64-bit integer. 752 * @phigh: in/out - higher 64-bit integer. 753 * @shift: in - bytes to shift, between 0 and 127. 754 * @overflow: out - true if any 1-bit is shifted out. 755 * 756 * Result is zero-extended and stored in plow/phigh, which are 757 * input/output variables. Shift values outside the range will 758 * be mod to 128. In other words, the caller is responsible to 759 * verify/assert both the shift range and plow/phigh pointers. 760 */ 761 void ulshift(uint64_t *plow, uint64_t *phigh, int32_t shift, bool *overflow); 762 763 /* From the GNU Multi Precision Library - longlong.h __udiv_qrnnd 764 * (https://gmplib.org/repo/gmp/file/tip/longlong.h) 765 * 766 * Licensed under the GPLv2/LGPLv3 767 */ 768 static inline uint64_t udiv_qrnnd(uint64_t *r, uint64_t n1, 769 uint64_t n0, uint64_t d) 770 { 771 #if defined(__x86_64__) 772 uint64_t q; 773 asm("divq %4" : "=a"(q), "=d"(*r) : "0"(n0), "1"(n1), "rm"(d)); 774 return q; 775 #elif defined(__s390x__) && !defined(__clang__) 776 /* Need to use a TImode type to get an even register pair for DLGR. */ 777 unsigned __int128 n = (unsigned __int128)n1 << 64 | n0; 778 asm("dlgr %0, %1" : "+r"(n) : "r"(d)); 779 *r = n >> 64; 780 return n; 781 #elif defined(_ARCH_PPC64) && defined(_ARCH_PWR7) 782 /* From Power ISA 2.06, programming note for divdeu. */ 783 uint64_t q1, q2, Q, r1, r2, R; 784 asm("divdeu %0,%2,%4; divdu %1,%3,%4" 785 : "=&r"(q1), "=r"(q2) 786 : "r"(n1), "r"(n0), "r"(d)); 787 r1 = -(q1 * d); /* low part of (n1<<64) - (q1 * d) */ 788 r2 = n0 - (q2 * d); 789 Q = q1 + q2; 790 R = r1 + r2; 791 if (R >= d || R < r2) { /* overflow implies R > d */ 792 Q += 1; 793 R -= d; 794 } 795 *r = R; 796 return Q; 797 #else 798 uint64_t d0, d1, q0, q1, r1, r0, m; 799 800 d0 = (uint32_t)d; 801 d1 = d >> 32; 802 803 r1 = n1 % d1; 804 q1 = n1 / d1; 805 m = q1 * d0; 806 r1 = (r1 << 32) | (n0 >> 32); 807 if (r1 < m) { 808 q1 -= 1; 809 r1 += d; 810 if (r1 >= d) { 811 if (r1 < m) { 812 q1 -= 1; 813 r1 += d; 814 } 815 } 816 } 817 r1 -= m; 818 819 r0 = r1 % d1; 820 q0 = r1 / d1; 821 m = q0 * d0; 822 r0 = (r0 << 32) | (uint32_t)n0; 823 if (r0 < m) { 824 q0 -= 1; 825 r0 += d; 826 if (r0 >= d) { 827 if (r0 < m) { 828 q0 -= 1; 829 r0 += d; 830 } 831 } 832 } 833 r0 -= m; 834 835 *r = r0; 836 return (q1 << 32) | q0; 837 #endif 838 } 839 840 Int128 divu256(Int128 *plow, Int128 *phigh, Int128 divisor); 841 Int128 divs256(Int128 *plow, Int128 *phigh, Int128 divisor); 842 #endif 843