1 /* mpn_divexact_by3c -- mpn exact division by 3. 2 3 Copyright 2000, 2001, 2002, 2003, 2008 Free Software Foundation, Inc. 4 5 This file is part of the GNU MP Library. 6 7 The GNU MP Library is free software; you can redistribute it and/or modify 8 it under the terms of the GNU Lesser General Public License as published by 9 the Free Software Foundation; either version 3 of the License, or (at your 10 option) any later version. 11 12 The GNU MP Library is distributed in the hope that it will be useful, but 13 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 14 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 15 License for more details. 16 17 You should have received a copy of the GNU Lesser General Public License 18 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 19 20 #include "gmp.h" 21 #include "gmp-impl.h" 22 23 #if DIVEXACT_BY3_METHOD == 0 24 25 mp_limb_t 26 mpn_divexact_by3c (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_limb_t c) 27 { 28 mp_limb_t r; 29 r = mpn_bdiv_dbm1c (rp, up, un, GMP_NUMB_MASK / 3, GMP_NUMB_MASK / 3 * c); 30 31 /* Possible bdiv_dbm1 return values are C * (GMP_NUMB_MASK / 3), 0 <= C < 3. 32 We want to return C. We compute the remainder mod 4 and notice that the 33 inverse of (2^(2k)-1)/3 mod 4 is 1. */ 34 return r & 3; 35 } 36 37 #endif 38 39 #if DIVEXACT_BY3_METHOD == 1 40 41 /* The algorithm here is basically the same as mpn_divexact_1, as described 42 in the manual. Namely at each step q = (src[i]-c)*inverse, and new c = 43 borrow(src[i]-c) + high(divisor*q). But because the divisor is just 3, 44 high(divisor*q) can be determined with two comparisons instead of a 45 multiply. 46 47 The "c += ..."s add the high limb of 3*l to c. That high limb will be 0, 48 1 or 2. Doing two separate "+="s seems to give better code on gcc (as of 49 2.95.2 at least). 50 51 It will be noted that the new c is formed by adding three values each 0 52 or 1. But the total is only 0, 1 or 2. When the subtraction src[i]-c 53 causes a borrow, that leaves a limb value of either 0xFF...FF or 54 0xFF...FE. The multiply by MODLIMB_INVERSE_3 gives 0x55...55 or 55 0xAA...AA respectively, and in those cases high(3*q) is only 0 or 1 56 respectively, hence a total of no more than 2. 57 58 Alternatives: 59 60 This implementation has each multiply on the dependent chain, due to 61 "l=s-c". See below for alternative code which avoids that. */ 62 63 mp_limb_t 64 mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t c) 65 { 66 mp_limb_t l, q, s; 67 mp_size_t i; 68 69 ASSERT (un >= 1); 70 ASSERT (c == 0 || c == 1 || c == 2); 71 ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); 72 73 i = 0; 74 do 75 { 76 s = up[i]; 77 SUBC_LIMB (c, l, s, c); 78 79 q = (l * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; 80 rp[i] = q; 81 82 c += (q >= GMP_NUMB_CEIL_MAX_DIV3); 83 c += (q >= GMP_NUMB_CEIL_2MAX_DIV3); 84 } 85 while (++i < un); 86 87 ASSERT (c == 0 || c == 1 || c == 2); 88 return c; 89 } 90 91 92 #endif 93 94 #if DIVEXACT_BY3_METHOD == 2 95 96 /* The following alternative code re-arranges the quotient calculation from 97 (src[i]-c)*inverse to instead 98 99 q = src[i]*inverse - c*inverse 100 101 thereby allowing src[i]*inverse to be scheduled back as far as desired, 102 making full use of multiplier throughput and leaving just some carry 103 handing on the dependent chain. 104 105 The carry handling consists of determining the c for the next iteration. 106 This is the same as described above, namely look for any borrow from 107 src[i]-c, and at the high of 3*q. 108 109 high(3*q) is done with two comparisons as above (in c2 and c3). The 110 borrow from src[i]-c is incorporated into those by noting that if there's 111 a carry then then we have src[i]-c == 0xFF..FF or 0xFF..FE, in turn 112 giving q = 0x55..55 or 0xAA..AA. Adding 1 to either of those q values is 113 enough to make high(3*q) come out 1 bigger, as required. 114 115 l = -c*inverse is calculated at the same time as c, since for most chips 116 it can be more conveniently derived from separate c1/c2/c3 values than 117 from a combined c equal to 0, 1 or 2. 118 119 The net effect is that with good pipelining this loop should be able to 120 run at perhaps 4 cycles/limb, depending on available execute resources 121 etc. 122 123 Usage: 124 125 This code is not used by default, since we really can't rely on the 126 compiler generating a good software pipeline, nor on such an approach 127 even being worthwhile on all CPUs. 128 129 Itanium is one chip where this algorithm helps though, see 130 mpn/ia64/diveby3.asm. */ 131 132 mp_limb_t 133 mpn_divexact_by3c (mp_ptr restrict rp, mp_srcptr restrict up, mp_size_t un, mp_limb_t cy) 134 { 135 mp_limb_t s, sm, cl, q, qx, c2, c3; 136 mp_size_t i; 137 138 ASSERT (un >= 1); 139 ASSERT (cy == 0 || cy == 1 || cy == 2); 140 ASSERT (MPN_SAME_OR_SEPARATE_P (rp, up, un)); 141 142 cl = cy == 0 ? 0 : cy == 1 ? -MODLIMB_INVERSE_3 : -2*MODLIMB_INVERSE_3; 143 144 for (i = 0; i < un; i++) 145 { 146 s = up[i]; 147 sm = (s * MODLIMB_INVERSE_3) & GMP_NUMB_MASK; 148 149 q = (cl + sm) & GMP_NUMB_MASK; 150 rp[i] = q; 151 qx = q + (s < cy); 152 153 c2 = qx >= GMP_NUMB_CEIL_MAX_DIV3; 154 c3 = qx >= GMP_NUMB_CEIL_2MAX_DIV3 ; 155 156 cy = c2 + c3; 157 cl = (-c2 & -MODLIMB_INVERSE_3) + (-c3 & -MODLIMB_INVERSE_3); 158 } 159 160 return cy; 161 } 162 163 #endif 164