1 /* mpn_perfect_square_p(u,usize) -- Return non-zero if U is a perfect square, 2 zero otherwise. 3 4 Copyright 1991, 1993, 1994, 1996, 1997, 2000, 2001, 2002, 2005 Free Software 5 Foundation, Inc. 6 7 This file is part of the GNU MP Library. 8 9 The GNU MP Library is free software; you can redistribute it and/or modify 10 it under the terms of the GNU Lesser General Public License as published by 11 the Free Software Foundation; either version 3 of the License, or (at your 12 option) any later version. 13 14 The GNU MP Library is distributed in the hope that it will be useful, but 15 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 16 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 17 License for more details. 18 19 You should have received a copy of the GNU Lesser General Public License 20 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 21 22 #include <stdio.h> /* for NULL */ 23 #include "gmp.h" 24 #include "gmp-impl.h" 25 #include "longlong.h" 26 27 #include "perfsqr.h" 28 29 30 /* change this to "#define TRACE(x) x" for diagnostics */ 31 #define TRACE(x) 32 33 34 35 /* PERFSQR_MOD_* detects non-squares using residue tests. 36 37 A macro PERFSQR_MOD_TEST is setup by gen-psqr.c in perfsqr.h. It takes 38 {up,usize} modulo a selected modulus to get a remainder r. For 32-bit or 39 64-bit limbs this modulus will be 2^24-1 or 2^48-1 using PERFSQR_MOD_34, 40 or for other limb or nail sizes a PERFSQR_PP is chosen and PERFSQR_MOD_PP 41 used. PERFSQR_PP_NORM and PERFSQR_PP_INVERTED are pre-calculated in this 42 case too. 43 44 PERFSQR_MOD_TEST then makes various calls to PERFSQR_MOD_1 or 45 PERFSQR_MOD_2 with divisors d which are factors of the modulus, and table 46 data indicating residues and non-residues modulo those divisors. The 47 table data is in 1 or 2 limbs worth of bits respectively, per the size of 48 each d. 49 50 A "modexact" style remainder is taken to reduce r modulo d. 51 PERFSQR_MOD_IDX implements this, producing an index "idx" for use with 52 the table data. Notice there's just one multiplication by a constant 53 "inv", for each d. 54 55 The modexact doesn't produce a true r%d remainder, instead idx satisfies 56 "-(idx<<PERFSQR_MOD_BITS) == r mod d". Because d is odd, this factor 57 -2^PERFSQR_MOD_BITS is a one-to-one mapping between r and idx, and is 58 accounted for by having the table data suitably permuted. 59 60 The remainder r fits within PERFSQR_MOD_BITS which is less than a limb. 61 In fact the GMP_LIMB_BITS - PERFSQR_MOD_BITS spare bits are enough to fit 62 each divisor d meaning the modexact multiply can take place entirely 63 within one limb, giving the compiler the chance to optimize it, in a way 64 that say umul_ppmm would not give. 65 66 There's no need for the divisors d to be prime, in fact gen-psqr.c makes 67 a deliberate effort to combine factors so as to reduce the number of 68 separate tests done on r. But such combining is limited to d <= 69 2*GMP_LIMB_BITS so that the table data fits in at most 2 limbs. 70 71 Alternatives: 72 73 It'd be possible to use bigger divisors d, and more than 2 limbs of table 74 data, but this doesn't look like it would be of much help to the prime 75 factors in the usual moduli 2^24-1 or 2^48-1. 76 77 The moduli 2^24-1 or 2^48-1 are nothing particularly special, they're 78 just easy to calculate (see mpn_mod_34lsub1) and have a nice set of prime 79 factors. 2^32-1 and 2^64-1 would be equally easy to calculate, but have 80 fewer prime factors. 81 82 The nails case usually ends up using mpn_mod_1, which is a lot slower 83 than mpn_mod_34lsub1. Perhaps other such special moduli could be found 84 for the nails case. Two-term things like 2^30-2^15-1 might be 85 candidates. Or at worst some on-the-fly de-nailing would allow the plain 86 2^24-1 to be used. Currently nails are too preliminary to be worried 87 about. 88 89 */ 90 91 #define PERFSQR_MOD_MASK ((CNST_LIMB(1) << PERFSQR_MOD_BITS) - 1) 92 93 #define MOD34_BITS (GMP_NUMB_BITS / 4 * 3) 94 #define MOD34_MASK ((CNST_LIMB(1) << MOD34_BITS) - 1) 95 96 #define PERFSQR_MOD_34(r, up, usize) \ 97 do { \ 98 (r) = mpn_mod_34lsub1 (up, usize); \ 99 (r) = ((r) & MOD34_MASK) + ((r) >> MOD34_BITS); \ 100 } while (0) 101 102 /* FIXME: The %= here isn't good, and might destroy any savings from keeping 103 the PERFSQR_MOD_IDX stuff within a limb (rather than needing umul_ppmm). 104 Maybe a new sort of mpn_preinv_mod_1 could accept an unnormalized divisor 105 and a shift count, like mpn_preinv_divrem_1. But mod_34lsub1 is our 106 normal case, so lets not worry too much about mod_1. */ 107 #define PERFSQR_MOD_PP(r, up, usize) \ 108 do { \ 109 if (USE_PREINV_MOD_1) \ 110 { \ 111 (r) = mpn_preinv_mod_1 (up, usize, PERFSQR_PP_NORM, \ 112 PERFSQR_PP_INVERTED); \ 113 (r) %= PERFSQR_PP; \ 114 } \ 115 else \ 116 { \ 117 (r) = mpn_mod_1 (up, usize, PERFSQR_PP); \ 118 } \ 119 } while (0) 120 121 #define PERFSQR_MOD_IDX(idx, r, d, inv) \ 122 do { \ 123 mp_limb_t q; \ 124 ASSERT ((r) <= PERFSQR_MOD_MASK); \ 125 ASSERT ((((inv) * (d)) & PERFSQR_MOD_MASK) == 1); \ 126 ASSERT (MP_LIMB_T_MAX / (d) >= PERFSQR_MOD_MASK); \ 127 \ 128 q = ((r) * (inv)) & PERFSQR_MOD_MASK; \ 129 ASSERT (r == ((q * (d)) & PERFSQR_MOD_MASK)); \ 130 (idx) = (q * (d)) >> PERFSQR_MOD_BITS; \ 131 } while (0) 132 133 #define PERFSQR_MOD_1(r, d, inv, mask) \ 134 do { \ 135 unsigned idx; \ 136 ASSERT ((d) <= GMP_LIMB_BITS); \ 137 PERFSQR_MOD_IDX(idx, r, d, inv); \ 138 TRACE (printf (" PERFSQR_MOD_1 d=%u r=%lu idx=%u\n", \ 139 d, r%d, idx)); \ 140 if ((((mask) >> idx) & 1) == 0) \ 141 { \ 142 TRACE (printf (" non-square\n")); \ 143 return 0; \ 144 } \ 145 } while (0) 146 147 /* The expression "(int) idx - GMP_LIMB_BITS < 0" lets the compiler use the 148 sign bit from "idx-GMP_LIMB_BITS", which might help avoid a branch. */ 149 #define PERFSQR_MOD_2(r, d, inv, mhi, mlo) \ 150 do { \ 151 mp_limb_t m; \ 152 unsigned idx; \ 153 ASSERT ((d) <= 2*GMP_LIMB_BITS); \ 154 \ 155 PERFSQR_MOD_IDX (idx, r, d, inv); \ 156 TRACE (printf (" PERFSQR_MOD_2 d=%u r=%lu idx=%u\n", \ 157 d, r%d, idx)); \ 158 m = ((int) idx - GMP_LIMB_BITS < 0 ? (mlo) : (mhi)); \ 159 idx %= GMP_LIMB_BITS; \ 160 if (((m >> idx) & 1) == 0) \ 161 { \ 162 TRACE (printf (" non-square\n")); \ 163 return 0; \ 164 } \ 165 } while (0) 166 167 168 int 169 mpn_perfect_square_p (mp_srcptr up, mp_size_t usize) 170 { 171 ASSERT (usize >= 1); 172 173 TRACE (gmp_printf ("mpn_perfect_square_p %Nd\n", up, usize)); 174 175 /* The first test excludes 212/256 (82.8%) of the perfect square candidates 176 in O(1) time. */ 177 { 178 unsigned idx = up[0] % 0x100; 179 if (((sq_res_0x100[idx / GMP_LIMB_BITS] 180 >> (idx % GMP_LIMB_BITS)) & 1) == 0) 181 return 0; 182 } 183 184 #if 0 185 /* Check that we have even multiplicity of 2, and then check that the rest is 186 a possible perfect square. Leave disabled until we can determine this 187 really is an improvement. It it is, it could completely replace the 188 simple probe above, since this should through out more non-squares, but at 189 the expense of somewhat more cycles. */ 190 { 191 mp_limb_t lo; 192 int cnt; 193 lo = up[0]; 194 while (lo == 0) 195 up++, lo = up[0], usize--; 196 count_trailing_zeros (cnt, lo); 197 if ((cnt & 1) != 0) 198 return 0; /* return of not even multiplicity of 2 */ 199 lo >>= cnt; /* shift down to align lowest non-zero bit */ 200 lo >>= 1; /* shift away lowest non-zero bit */ 201 if ((lo & 3) != 0) 202 return 0; 203 } 204 #endif 205 206 207 /* The second test uses mpn_mod_34lsub1 or mpn_mod_1 to detect non-squares 208 according to their residues modulo small primes (or powers of 209 primes). See perfsqr.h. */ 210 PERFSQR_MOD_TEST (up, usize); 211 212 213 /* For the third and last test, we finally compute the square root, 214 to make sure we've really got a perfect square. */ 215 { 216 mp_ptr root_ptr; 217 int res; 218 TMP_DECL; 219 220 TMP_MARK; 221 root_ptr = (mp_ptr) TMP_ALLOC ((usize + 1) / 2 * BYTES_PER_MP_LIMB); 222 223 /* Iff mpn_sqrtrem returns zero, the square is perfect. */ 224 res = ! mpn_sqrtrem (root_ptr, NULL, up, usize); 225 TMP_FREE; 226 227 return res; 228 } 229 } 230