1 /* hgcd2.c 2 3 THE FUNCTIONS IN THIS FILE ARE INTERNAL WITH MUTABLE INTERFACES. IT IS ONLY 4 SAFE TO REACH THEM THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 5 GUARANTEED THAT THEY'LL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 6 7 Copyright 1996, 1998, 2000, 2001, 2002, 2003, 2004, 2008 Free Software 8 Foundation, Inc. 9 10 This file is part of the GNU MP Library. 11 12 The GNU MP Library is free software; you can redistribute it and/or modify 13 it under the terms of the GNU Lesser General Public License as published by 14 the Free Software Foundation; either version 3 of the License, or (at your 15 option) any later version. 16 17 The GNU MP Library is distributed in the hope that it will be useful, but 18 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 19 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 20 License for more details. 21 22 You should have received a copy of the GNU Lesser General Public License 23 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 24 25 #include "gmp.h" 26 #include "gmp-impl.h" 27 #include "longlong.h" 28 29 #if GMP_NAIL_BITS == 0 30 31 /* Copied from the old mpn/generic/gcdext.c, and modified slightly to return 32 the remainder. */ 33 34 /* Single-limb division optimized for small quotients. */ 35 static inline mp_limb_t 36 div1 (mp_ptr rp, 37 mp_limb_t n0, 38 mp_limb_t d0) 39 { 40 mp_limb_t q = 0; 41 42 if ((mp_limb_signed_t) n0 < 0) 43 { 44 int cnt; 45 for (cnt = 1; (mp_limb_signed_t) d0 >= 0; cnt++) 46 { 47 d0 = d0 << 1; 48 } 49 50 q = 0; 51 while (cnt) 52 { 53 q <<= 1; 54 if (n0 >= d0) 55 { 56 n0 = n0 - d0; 57 q |= 1; 58 } 59 d0 = d0 >> 1; 60 cnt--; 61 } 62 } 63 else 64 { 65 int cnt; 66 for (cnt = 0; n0 >= d0; cnt++) 67 { 68 d0 = d0 << 1; 69 } 70 71 q = 0; 72 while (cnt) 73 { 74 d0 = d0 >> 1; 75 q <<= 1; 76 if (n0 >= d0) 77 { 78 n0 = n0 - d0; 79 q |= 1; 80 } 81 cnt--; 82 } 83 } 84 *rp = n0; 85 return q; 86 } 87 88 /* Two-limb division optimized for small quotients. */ 89 static inline mp_limb_t 90 div2 (mp_ptr rp, 91 mp_limb_t nh, mp_limb_t nl, 92 mp_limb_t dh, mp_limb_t dl) 93 { 94 mp_limb_t q = 0; 95 96 if ((mp_limb_signed_t) nh < 0) 97 { 98 int cnt; 99 for (cnt = 1; (mp_limb_signed_t) dh >= 0; cnt++) 100 { 101 dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1)); 102 dl = dl << 1; 103 } 104 105 while (cnt) 106 { 107 q <<= 1; 108 if (nh > dh || (nh == dh && nl >= dl)) 109 { 110 sub_ddmmss (nh, nl, nh, nl, dh, dl); 111 q |= 1; 112 } 113 dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1); 114 dh = dh >> 1; 115 cnt--; 116 } 117 } 118 else 119 { 120 int cnt; 121 for (cnt = 0; nh > dh || (nh == dh && nl >= dl); cnt++) 122 { 123 dh = (dh << 1) | (dl >> (GMP_LIMB_BITS - 1)); 124 dl = dl << 1; 125 } 126 127 while (cnt) 128 { 129 dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1); 130 dh = dh >> 1; 131 q <<= 1; 132 if (nh > dh || (nh == dh && nl >= dl)) 133 { 134 sub_ddmmss (nh, nl, nh, nl, dh, dl); 135 q |= 1; 136 } 137 cnt--; 138 } 139 } 140 141 rp[0] = nl; 142 rp[1] = nh; 143 144 return q; 145 } 146 147 #if 0 148 /* This div2 uses less branches, but it seems to nevertheless be 149 slightly slower than the above code. */ 150 static inline mp_limb_t 151 div2 (mp_ptr rp, 152 mp_limb_t nh, mp_limb_t nl, 153 mp_limb_t dh, mp_limb_t dl) 154 { 155 mp_limb_t q = 0; 156 int ncnt; 157 int dcnt; 158 159 count_leading_zeros (ncnt, nh); 160 count_leading_zeros (dcnt, dh); 161 dcnt -= ncnt; 162 163 dh = (dh << dcnt) + (-(dcnt > 0) & (dl >> (GMP_LIMB_BITS - dcnt))); 164 dl <<= dcnt; 165 166 do 167 { 168 mp_limb_t bit; 169 q <<= 1; 170 if (UNLIKELY (nh == dh)) 171 bit = (nl >= dl); 172 else 173 bit = (nh > dh); 174 175 q |= bit; 176 177 sub_ddmmss (nh, nl, nh, nl, (-bit) & dh, (-bit) & dl); 178 179 dl = (dh << (GMP_LIMB_BITS - 1)) | (dl >> 1); 180 dh = dh >> 1; 181 } 182 while (dcnt--); 183 184 rp[0] = nl; 185 rp[1] = nh; 186 187 return q; 188 } 189 #endif 190 191 #else /* GMP_NAIL_BITS != 0 */ 192 /* Check all functions for nail support. */ 193 /* hgcd2 should be defined to take inputs including nail bits, and 194 produce a matrix with elements also including nail bits. This is 195 necessary, for the matrix elements to be useful with mpn_mul_1, 196 mpn_addmul_1 and friends. */ 197 #error Not implemented 198 #endif /* GMP_NAIL_BITS != 0 */ 199 200 /* Reduces a,b until |a-b| (almost) fits in one limb + 1 bit. Constructs 201 matrix M. Returns 1 if we make progress, i.e. can perform at least 202 one subtraction. Otherwise returns zero.. */ 203 204 /* FIXME: Possible optimizations: 205 206 The div2 function starts with checking the most significant bit of 207 the numerator. We can maintained normalized operands here, call 208 hgcd with normalized operands only, which should make the code 209 simpler and possibly faster. 210 211 Experiment with table lookups on the most significant bits. 212 213 This function is also a candidate for assembler implementation. 214 */ 215 int 216 mpn_hgcd2 (mp_limb_t ah, mp_limb_t al, mp_limb_t bh, mp_limb_t bl, 217 struct hgcd_matrix1 *M) 218 { 219 mp_limb_t u00, u01, u10, u11; 220 221 if (ah < 2 || bh < 2) 222 return 0; 223 224 if (ah > bh || (ah == bh && al > bl)) 225 { 226 sub_ddmmss (ah, al, ah, al, bh, bl); 227 if (ah < 2) 228 return 0; 229 230 u00 = u01 = u11 = 1; 231 u10 = 0; 232 } 233 else 234 { 235 sub_ddmmss (bh, bl, bh, bl, ah, al); 236 if (bh < 2) 237 return 0; 238 239 u00 = u10 = u11 = 1; 240 u01 = 0; 241 } 242 243 if (ah < bh) 244 goto subtract_a; 245 246 for (;;) 247 { 248 ASSERT (ah >= bh); 249 if (ah == bh) 250 goto done; 251 252 if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) 253 { 254 ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); 255 bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); 256 257 break; 258 } 259 260 /* Subtract a -= q b, and multiply M from the right by (1 q ; 0 261 1), affecting the second column of M. */ 262 ASSERT (ah > bh); 263 sub_ddmmss (ah, al, ah, al, bh, bl); 264 265 if (ah < 2) 266 goto done; 267 268 if (ah <= bh) 269 { 270 /* Use q = 1 */ 271 u01 += u00; 272 u11 += u10; 273 } 274 else 275 { 276 mp_limb_t r[2]; 277 mp_limb_t q = div2 (r, ah, al, bh, bl); 278 al = r[0]; ah = r[1]; 279 if (ah < 2) 280 { 281 /* A is too small, but q is correct. */ 282 u01 += q * u00; 283 u11 += q * u10; 284 goto done; 285 } 286 q++; 287 u01 += q * u00; 288 u11 += q * u10; 289 } 290 subtract_a: 291 ASSERT (bh >= ah); 292 if (ah == bh) 293 goto done; 294 295 if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2))) 296 { 297 ah = (ah << (GMP_LIMB_BITS / 2) ) + (al >> (GMP_LIMB_BITS / 2)); 298 bh = (bh << (GMP_LIMB_BITS / 2) ) + (bl >> (GMP_LIMB_BITS / 2)); 299 300 goto subtract_a1; 301 } 302 303 /* Subtract b -= q a, and multiply M from the right by (1 0 ; q 304 1), affecting the first column of M. */ 305 sub_ddmmss (bh, bl, bh, bl, ah, al); 306 307 if (bh < 2) 308 goto done; 309 310 if (bh <= ah) 311 { 312 /* Use q = 1 */ 313 u00 += u01; 314 u10 += u11; 315 } 316 else 317 { 318 mp_limb_t r[2]; 319 mp_limb_t q = div2 (r, bh, bl, ah, al); 320 bl = r[0]; bh = r[1]; 321 if (bh < 2) 322 { 323 /* B is too small, but q is correct. */ 324 u00 += q * u01; 325 u10 += q * u11; 326 goto done; 327 } 328 q++; 329 u00 += q * u01; 330 u10 += q * u11; 331 } 332 } 333 334 /* NOTE: Since we discard the least significant half limb, we don't 335 get a truly maximal M (corresponding to |a - b| < 336 2^{GMP_LIMB_BITS +1}). */ 337 /* Single precision loop */ 338 for (;;) 339 { 340 ASSERT (ah >= bh); 341 if (ah == bh) 342 break; 343 344 ah -= bh; 345 if (ah < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) 346 break; 347 348 if (ah <= bh) 349 { 350 /* Use q = 1 */ 351 u01 += u00; 352 u11 += u10; 353 } 354 else 355 { 356 mp_limb_t r; 357 mp_limb_t q = div1 (&r, ah, bh); 358 ah = r; 359 if (ah < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) 360 { 361 /* A is too small, but q is correct. */ 362 u01 += q * u00; 363 u11 += q * u10; 364 break; 365 } 366 q++; 367 u01 += q * u00; 368 u11 += q * u10; 369 } 370 subtract_a1: 371 ASSERT (bh >= ah); 372 if (ah == bh) 373 break; 374 375 bh -= ah; 376 if (bh < (CNST_LIMB (1) << (GMP_LIMB_BITS / 2 + 1))) 377 break; 378 379 if (bh <= ah) 380 { 381 /* Use q = 1 */ 382 u00 += u01; 383 u10 += u11; 384 } 385 else 386 { 387 mp_limb_t r; 388 mp_limb_t q = div1 (&r, bh, ah); 389 bh = r; 390 if (bh < (CNST_LIMB(1) << (GMP_LIMB_BITS / 2 + 1))) 391 { 392 /* B is too small, but q is correct. */ 393 u00 += q * u01; 394 u10 += q * u11; 395 break; 396 } 397 q++; 398 u00 += q * u01; 399 u10 += q * u11; 400 } 401 } 402 403 done: 404 M->u[0][0] = u00; M->u[0][1] = u01; 405 M->u[1][0] = u10; M->u[1][1] = u11; 406 407 return 1; 408 } 409 410 /* Sets (r;b) = (a;b) M, with M = (u00, u01; u10, u11). Vector must 411 * have space for n + 1 limbs. Uses three buffers to avoid a copy*/ 412 mp_size_t 413 mpn_hgcd_mul_matrix1_vector (const struct hgcd_matrix1 *M, 414 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n) 415 { 416 mp_limb_t ah, bh; 417 418 /* Compute (r,b) <-- (u00 a + u10 b, u01 a + u11 b) as 419 420 r = u00 * a 421 r += u10 * b 422 b *= u11 423 b += u01 * a 424 */ 425 426 #if HAVE_NATIVE_mpn_addaddmul_1msb0 427 ah = mpn_addaddmul_1msb0 (rp, ap, bp, n, M->u[0][0], M->u[1][0]); 428 bh = mpn_addaddmul_1msb0 (bp, bp, ap, n, M->u[1][1], M->u[0][1]); 429 #else 430 ah = mpn_mul_1 (rp, ap, n, M->u[0][0]); 431 ah += mpn_addmul_1 (rp, bp, n, M->u[1][0]); 432 433 bh = mpn_mul_1 (bp, bp, n, M->u[1][1]); 434 bh += mpn_addmul_1 (bp, ap, n, M->u[0][1]); 435 #endif 436 rp[n] = ah; 437 bp[n] = bh; 438 439 n += (ah | bh) > 0; 440 return n; 441 } 442 443 /* Sets (r;b) = M^{-1}(a;b), with M^{-1} = (u11, -u01; -u10, u00) from 444 the left. Uses three buffers, to avoid a copy. */ 445 mp_size_t 446 mpn_hgcd_mul_matrix1_inverse_vector (const struct hgcd_matrix1 *M, 447 mp_ptr rp, mp_srcptr ap, mp_ptr bp, mp_size_t n) 448 { 449 mp_limb_t h0, h1; 450 451 /* Compute (r;b) <-- (u11 a - u01 b; -u10 a + u00 b) as 452 453 r = u11 * a 454 r -= u01 * b 455 b *= u00 456 b -= u10 * a 457 */ 458 459 h0 = mpn_mul_1 (rp, ap, n, M->u[1][1]); 460 h1 = mpn_submul_1 (rp, bp, n, M->u[0][1]); 461 ASSERT (h0 == h1); 462 463 h0 = mpn_mul_1 (bp, bp, n, M->u[0][0]); 464 h1 = mpn_submul_1 (bp, ap, n, M->u[1][0]); 465 ASSERT (h0 == h1); 466 467 n -= (rp[n-1] | bp[n-1]) == 0; 468 return n; 469 } 470