1 /* mpn_modexact_1c_odd -- mpn by limb exact division style remainder. 2 3 THE FUNCTIONS IN THIS FILE ARE FOR INTERNAL USE ONLY. THEY'RE ALMOST 4 CERTAIN TO BE SUBJECT TO INCOMPATIBLE CHANGES OR DISAPPEAR COMPLETELY IN 5 FUTURE GNU MP RELEASES. 6 7 Copyright 2000, 2001, 2002, 2003, 2004 Free Software Foundation, Inc. 8 9 This file is part of the GNU MP Library. 10 11 The GNU MP Library is free software; you can redistribute it and/or modify 12 it under the terms of the GNU Lesser General Public License as published by 13 the Free Software Foundation; either version 3 of the License, or (at your 14 option) any later version. 15 16 The GNU MP Library is distributed in the hope that it will be useful, but 17 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 18 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 19 License for more details. 20 21 You should have received a copy of the GNU Lesser General Public License 22 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 23 24 #include "gmp.h" 25 #include "gmp-impl.h" 26 #include "longlong.h" 27 28 29 /* Calculate an r satisfying 30 31 r*B^k + a - c == q*d 32 33 where B=2^GMP_LIMB_BITS, a is {src,size}, k is either size or size-1 34 (the caller won't know which), and q is the quotient (discarded). d must 35 be odd, c can be any limb value. 36 37 If c<d then r will be in the range 0<=r<d, or if c>=d then 0<=r<=d. 38 39 This slightly strange function suits the initial Nx1 reduction for GCDs 40 or Jacobi symbols since the factors of 2 in B^k can be ignored, leaving 41 -r == a mod d (by passing c=0). For a GCD the factor of -1 on r can be 42 ignored, or for the Jacobi symbol it can be accounted for. The function 43 also suits divisibility and congruence testing since if r=0 (or r=d) is 44 obtained then a==c mod d. 45 46 47 r is a bit like the remainder returned by mpn_divexact_by3c, and is the 48 sort of remainder mpn_divexact_1 might return. Like mpn_divexact_by3c, r 49 represents a borrow, since effectively quotient limbs are chosen so that 50 subtracting that multiple of d from src at each step will produce a zero 51 limb. 52 53 A long calculation can be done piece by piece from low to high by passing 54 the return value from one part as the carry parameter to the next part. 55 The effective final k becomes anything between size and size-n, if n 56 pieces are used. 57 58 59 A similar sort of routine could be constructed based on adding multiples 60 of d at each limb, much like redc in mpz_powm does. Subtracting however 61 has a small advantage that when subtracting to cancel out l there's never 62 a borrow into h, whereas using an addition would put a carry into h 63 depending whether l==0 or l!=0. 64 65 66 In terms of efficiency, this function is similar to a mul-by-inverse 67 mpn_mod_1. Both are essentially two multiplies and are best suited to 68 CPUs with low latency multipliers (in comparison to a divide instruction 69 at least.) But modexact has a few less supplementary operations, only 70 needs low part and high part multiplies, and has fewer working quantities 71 (helping CPUs with few registers). 72 73 74 In the main loop it will be noted that the new carry (call it r) is the 75 sum of the high product h and any borrow from l=s-c. If c<d then we will 76 have r<d too, for the following reasons. Let q=l*inverse be the quotient 77 limb, so that q*d = B*h + l, where B=2^GMP_NUMB_BITS. Now if h=d-1 then 78 79 l = q*d - B*(d-1) <= (B-1)*d - B*(d-1) = B-d 80 81 But if l=s-c produces a borrow when c<d, then l>=B-d+1 and hence will 82 never have h=d-1 and so r=h+borrow <= d-1. 83 84 When c>=d, on the other hand, h=d-1 can certainly occur together with a 85 borrow, thereby giving only r<=d, as per the function definition above. 86 87 As a design decision it's left to the caller to check for r=d if it might 88 be passing c>=d. Several applications have c<d initially so the extra 89 test is often unnecessary, for example the GCDs or a plain divisibility 90 d|a test will pass c=0. 91 92 93 The special case for size==1 is so that it can be assumed c<=d in the 94 high<=divisor test at the end. c<=d is only guaranteed after at least 95 one iteration of the main loop. There's also a decent chance one % is 96 faster than a binvert_limb, though that will depend on the processor. 97 98 A CPU specific implementation might want to omit the size==1 code or the 99 high<divisor test. mpn/x86/k6/mode1o.asm for instance finds neither 100 useful. */ 101 102 103 mp_limb_t 104 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, 105 mp_limb_t orig_c) 106 { 107 mp_limb_t s, h, l, inverse, dummy, dmul, ret; 108 mp_limb_t c = orig_c; 109 mp_size_t i; 110 111 ASSERT (size >= 1); 112 ASSERT (d & 1); 113 ASSERT_MPN (src, size); 114 ASSERT_LIMB (d); 115 ASSERT_LIMB (c); 116 117 if (size == 1) 118 { 119 s = src[0]; 120 if (s > c) 121 { 122 l = s-c; 123 h = l % d; 124 if (h != 0) 125 h = d - h; 126 } 127 else 128 { 129 l = c-s; 130 h = l % d; 131 } 132 return h; 133 } 134 135 136 binvert_limb (inverse, d); 137 dmul = d << GMP_NAIL_BITS; 138 139 i = 0; 140 do 141 { 142 s = src[i]; 143 SUBC_LIMB (c, l, s, c); 144 l = (l * inverse) & GMP_NUMB_MASK; 145 umul_ppmm (h, dummy, l, dmul); 146 c += h; 147 } 148 while (++i < size-1); 149 150 151 s = src[i]; 152 if (s <= d) 153 { 154 /* With high<=d the final step can be a subtract and addback. If c==0 155 then the addback will restore to l>=0. If c==d then will get l==d 156 if s==0, but that's ok per the function definition. */ 157 158 l = c - s; 159 if (c < s) 160 l += d; 161 162 ret = l; 163 } 164 else 165 { 166 /* Can't skip a divide, just do the loop code once more. */ 167 168 SUBC_LIMB (c, l, s, c); 169 l = (l * inverse) & GMP_NUMB_MASK; 170 umul_ppmm (h, dummy, l, dmul); 171 c += h; 172 ret = c; 173 } 174 175 ASSERT (orig_c < d ? ret < d : ret <= d); 176 return ret; 177 } 178 179 180 181 #if 0 182 183 /* The following is an alternate form that might shave one cycle on a 184 superscalar processor since it takes c+=h off the dependent chain, 185 leaving just a low product, high product, and a subtract. 186 187 This is for CPU specific implementations to consider. A special case for 188 high<divisor and/or size==1 can be added if desired. 189 190 Notice that c is only ever 0 or 1, since if s-c produces a borrow then 191 x=0xFF..FF and x-h cannot produce a borrow. The c=(x>s) could become 192 c=(x==0xFF..FF) too, if that helped. */ 193 194 mp_limb_t 195 mpn_modexact_1c_odd (mp_srcptr src, mp_size_t size, mp_limb_t d, mp_limb_t h) 196 { 197 mp_limb_t s, x, y, inverse, dummy, dmul, c1, c2; 198 mp_limb_t c = 0; 199 mp_size_t i; 200 201 ASSERT (size >= 1); 202 ASSERT (d & 1); 203 204 binvert_limb (inverse, d); 205 dmul = d << GMP_NAIL_BITS; 206 207 for (i = 0; i < size; i++) 208 { 209 ASSERT (c==0 || c==1); 210 211 s = src[i]; 212 SUBC_LIMB (c1, x, s, c); 213 214 SUBC_LIMB (c2, y, x, h); 215 c = c1 + c2; 216 217 y = (y * inverse) & GMP_NUMB_MASK; 218 umul_ppmm (h, dummy, y, dmul); 219 } 220 221 h += c; 222 return h; 223 } 224 225 #endif 226