1 /* Interpolaton for the algorithm Toom-Cook 6.5-way. 2 3 Contributed to the GNU project by Marco Bodrato. 4 5 THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE. IT IS ONLY 6 SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES. IN FACT, IT IS ALMOST 7 GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE. 8 9 Copyright 2009, 2010 Free Software Foundation, Inc. 10 11 This file is part of the GNU MP Library. 12 13 The GNU MP Library is free software; you can redistribute it and/or modify 14 it under the terms of the GNU Lesser General Public License as published by 15 the Free Software Foundation; either version 3 of the License, or (at your 16 option) any later version. 17 18 The GNU MP Library is distributed in the hope that it will be useful, but 19 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 20 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 21 License for more details. 22 23 You should have received a copy of the GNU Lesser General Public License 24 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 25 26 27 #include "gmp.h" 28 #include "gmp-impl.h" 29 30 31 #if HAVE_NATIVE_mpn_sublsh_n 32 #define DO_mpn_sublsh_n(dst,src,n,s,ws) mpn_sublsh_n(dst,dst,src,n,s) 33 #else 34 static mp_limb_t 35 DO_mpn_sublsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws) 36 { 37 #if USE_MUL_1 && 0 38 return mpn_submul_1(dst,src,n,CNST_LIMB(1) <<(s)); 39 #else 40 mp_limb_t __cy; 41 __cy = mpn_lshift(ws,src,n,s); 42 return __cy + mpn_sub_n(dst,dst,ws,n); 43 #endif 44 } 45 #endif 46 47 #if HAVE_NATIVE_mpn_addlsh_n 48 #define DO_mpn_addlsh_n(dst,src,n,s,ws) mpn_addlsh_n(dst,dst,src,n,s) 49 #else 50 static mp_limb_t 51 DO_mpn_addlsh_n(mp_ptr dst, mp_srcptr src, mp_size_t n, unsigned int s, mp_ptr ws) 52 { 53 #if USE_MUL_1 && 0 54 return mpn_addmul_1(dst,src,n,CNST_LIMB(1) <<(s)); 55 #else 56 mp_limb_t __cy; 57 __cy = mpn_lshift(ws,src,n,s); 58 return __cy + mpn_add_n(dst,dst,ws,n); 59 #endif 60 } 61 #endif 62 63 #if HAVE_NATIVE_mpn_subrsh 64 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) mpn_subrsh(dst,nd,src,ns,s) 65 #else 66 /* FIXME: This is not a correct definition, it assumes no carry */ 67 #define DO_mpn_subrsh(dst,nd,src,ns,s,ws) \ 68 do { \ 69 mp_limb_t __cy; \ 70 MPN_DECR_U (dst, nd, src[0] >> s); \ 71 __cy = DO_mpn_sublsh_n (dst, src + 1, ns - 1, GMP_NUMB_BITS - s, ws); \ 72 MPN_DECR_U (dst + ns - 1, nd - ns + 1, __cy); \ 73 } while (0) 74 #endif 75 76 77 #if GMP_NUMB_BITS < 21 78 #error Not implemented: Both sublsh_n(,,,20) should be corrected. 79 #endif 80 81 #if GMP_NUMB_BITS < 16 82 #error Not implemented: divexact_by42525 needs splitting. 83 #endif 84 85 #if GMP_NUMB_BITS < 12 86 #error Not implemented: Hard to adapt... 87 #endif 88 89 /* FIXME: tuneup should decide the best variant */ 90 #ifndef AORSMUL_FASTER_AORS_AORSLSH 91 #define AORSMUL_FASTER_AORS_AORSLSH 1 92 #endif 93 #ifndef AORSMUL_FASTER_AORS_2AORSLSH 94 #define AORSMUL_FASTER_AORS_2AORSLSH 1 95 #endif 96 #ifndef AORSMUL_FASTER_2AORSLSH 97 #define AORSMUL_FASTER_2AORSLSH 1 98 #endif 99 #ifndef AORSMUL_FASTER_3AORSLSH 100 #define AORSMUL_FASTER_3AORSLSH 1 101 #endif 102 103 #define BINVERT_9 \ 104 ((((GMP_NUMB_MAX / 9) << (6 - GMP_NUMB_BITS % 6)) * 8 & GMP_NUMB_MAX) | 0x39) 105 106 #define BINVERT_255 \ 107 (GMP_NUMB_MAX - ((GMP_NUMB_MAX / 255) << (8 - GMP_NUMB_BITS % 8))) 108 109 /* FIXME: find some more general expressions for 2835^-1, 42525^-1 */ 110 #if GMP_LIMB_BITS == 32 111 #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x53E3771B)) 112 #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0x9F314C35)) 113 #else 114 #if GMP_LIMB_BITS == 64 115 #define BINVERT_2835 (GMP_NUMB_MASK & CNST_LIMB(0x938CC70553E3771B)) 116 #define BINVERT_42525 (GMP_NUMB_MASK & CNST_LIMB(0xE7B40D449F314C35)) 117 #endif 118 #endif 119 120 #ifndef mpn_divexact_by255 121 #if GMP_NUMB_BITS % 8 == 0 122 #define mpn_divexact_by255(dst,src,size) \ 123 (255 & 1 * mpn_bdiv_dbm1 (dst, src, size, __GMP_CAST (mp_limb_t, GMP_NUMB_MASK / 255))) 124 #else 125 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 126 #define mpn_divexact_by255(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(255),BINVERT_255,0) 127 #else 128 #define mpn_divexact_by255(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(255)) 129 #endif 130 #endif 131 #endif 132 133 #ifndef mpn_divexact_by9x4 134 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 135 #define mpn_divexact_by9x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(9),BINVERT_9,2) 136 #else 137 #define mpn_divexact_by9x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(9)<<2) 138 #endif 139 #endif 140 141 #ifndef mpn_divexact_by42525 142 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_42525) 143 #define mpn_divexact_by42525(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(42525),BINVERT_42525,0) 144 #else 145 #define mpn_divexact_by42525(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(42525)) 146 #endif 147 #endif 148 149 #ifndef mpn_divexact_by2835x4 150 #if HAVE_NATIVE_mpn_pi1_bdiv_q_1 && defined(BINVERT_2835) 151 #define mpn_divexact_by2835x4(dst,src,size) mpn_pi1_bdiv_q_1(dst,src,size,CNST_LIMB(2835),BINVERT_2835,2) 152 #else 153 #define mpn_divexact_by2835x4(dst,src,size) mpn_divexact_1(dst,src,size,CNST_LIMB(2835)<<2) 154 #endif 155 #endif 156 157 /* Interpolation for Toom-6.5 (or Toom-6), using the evaluation 158 points: infinity(6.5 only), +-4, +-2, +-1, +-1/4, +-1/2, 0. More precisely, 159 we want to compute f(2^(GMP_NUMB_BITS * n)) for a polynomial f of 160 degree 11 (or 10), given the 12 (rsp. 11) values: 161 162 r0 = limit at infinity of f(x) / x^7, 163 r1 = f(4),f(-4), 164 r2 = f(2),f(-2), 165 r3 = f(1),f(-1), 166 r4 = f(1/4),f(-1/4), 167 r5 = f(1/2),f(-1/2), 168 r6 = f(0). 169 170 All couples of the form f(n),f(-n) must be already mixed with 171 toom_couple_handling(f(n),...,f(-n),...) 172 173 The result is stored in {pp, spt + 7*n (or 6*n)}. 174 At entry, r6 is stored at {pp, 2n}, 175 r4 is stored at {pp + 3n, 3n + 1}. 176 r2 is stored at {pp + 7n, 3n + 1}. 177 r0 is stored at {pp +11n, spt}. 178 179 The other values are 3n+1 limbs each (with most significant limbs small). 180 181 Negative intermediate results are stored two-complemented. 182 Inputs are destroyed. 183 */ 184 185 void 186 mpn_toom_interpolate_12pts (mp_ptr pp, mp_ptr r1, mp_ptr r3, mp_ptr r5, 187 mp_size_t n, mp_size_t spt, int half, mp_ptr wsi) 188 { 189 mp_limb_t cy; 190 mp_size_t n3; 191 mp_size_t n3p1; 192 n3 = 3 * n; 193 n3p1 = n3 + 1; 194 195 #define r4 (pp + n3) /* 3n+1 */ 196 #define r2 (pp + 7 * n) /* 3n+1 */ 197 #define r0 (pp +11 * n) /* s+t <= 2*n */ 198 199 /******************************* interpolation *****************************/ 200 if (half != 0) { 201 cy = mpn_sub_n (r3, r3, r0, spt); 202 MPN_DECR_U (r3 + spt, n3p1 - spt, cy); 203 204 cy = DO_mpn_sublsh_n (r2, r0, spt, 10, wsi); 205 MPN_DECR_U (r2 + spt, n3p1 - spt, cy); 206 DO_mpn_subrsh(r5, n3p1, r0, spt, 2, wsi); 207 208 cy = DO_mpn_sublsh_n (r1, r0, spt, 20, wsi); 209 MPN_DECR_U (r1 + spt, n3p1 - spt, cy); 210 DO_mpn_subrsh(r4, n3p1, r0, spt, 4, wsi); 211 }; 212 213 r4[n3] -= DO_mpn_sublsh_n (r4 + n, pp, 2 * n, 20, wsi); 214 DO_mpn_subrsh(r1 + n, 2 * n + 1, pp, 2 * n, 4, wsi); 215 216 #if HAVE_NATIVE_mpn_add_n_sub_n 217 mpn_add_n_sub_n (r1, r4, r4, r1, n3p1); 218 #else 219 ASSERT_NOCARRY(mpn_add_n (wsi, r1, r4, n3p1)); 220 mpn_sub_n (r4, r4, r1, n3p1); /* can be negative */ 221 MP_PTR_SWAP(r1, wsi); 222 #endif 223 224 r5[n3] -= DO_mpn_sublsh_n (r5 + n, pp, 2 * n, 10, wsi); 225 DO_mpn_subrsh(r2 + n, 2 * n + 1, pp, 2 * n, 2, wsi); 226 227 #if HAVE_NATIVE_mpn_add_n_sub_n 228 mpn_add_n_sub_n (r2, r5, r5, r2, n3p1); 229 #else 230 mpn_sub_n (wsi, r5, r2, n3p1); /* can be negative */ 231 ASSERT_NOCARRY(mpn_add_n (r2, r2, r5, n3p1)); 232 MP_PTR_SWAP(r5, wsi); 233 #endif 234 235 r3[n3] -= mpn_sub_n (r3+n, r3+n, pp, 2 * n); 236 237 #if AORSMUL_FASTER_AORS_AORSLSH 238 mpn_submul_1 (r4, r5, n3p1, 257); /* can be negative */ 239 #else 240 mpn_sub_n (r4, r4, r5, n3p1); /* can be negative */ 241 DO_mpn_sublsh_n (r4, r5, n3p1, 8, wsi); /* can be negative */ 242 #endif 243 /* A division by 2835x4 followsi. Warning: the operand can be negative! */ 244 mpn_divexact_by2835x4(r4, r4, n3p1); 245 if ((r4[n3] & (GMP_NUMB_MAX << (GMP_NUMB_BITS-3))) != 0) 246 r4[n3] |= (GMP_NUMB_MAX << (GMP_NUMB_BITS-2)); 247 248 #if AORSMUL_FASTER_2AORSLSH 249 mpn_addmul_1 (r5, r4, n3p1, 60); /* can be negative */ 250 #else 251 DO_mpn_sublsh_n (r5, r4, n3p1, 2, wsi); /* can be negative */ 252 DO_mpn_addlsh_n (r5, r4, n3p1, 6, wsi); /* can give a carry */ 253 #endif 254 mpn_divexact_by255(r5, r5, n3p1); 255 256 ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r3, n3p1, 5, wsi)); 257 258 #if AORSMUL_FASTER_3AORSLSH 259 ASSERT_NOCARRY(mpn_submul_1 (r1, r2, n3p1, 100)); 260 #else 261 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 6, wsi)); 262 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 5, wsi)); 263 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r2, n3p1, 2, wsi)); 264 #endif 265 ASSERT_NOCARRY(DO_mpn_sublsh_n (r1, r3, n3p1, 9, wsi)); 266 mpn_divexact_by42525(r1, r1, n3p1); 267 268 #if AORSMUL_FASTER_AORS_2AORSLSH 269 ASSERT_NOCARRY(mpn_submul_1 (r2, r1, n3p1, 225)); 270 #else 271 ASSERT_NOCARRY(mpn_sub_n (r2, r2, r1, n3p1)); 272 ASSERT_NOCARRY(DO_mpn_addlsh_n (r2, r1, n3p1, 5, wsi)); 273 ASSERT_NOCARRY(DO_mpn_sublsh_n (r2, r1, n3p1, 8, wsi)); 274 #endif 275 mpn_divexact_by9x4(r2, r2, n3p1); 276 277 ASSERT_NOCARRY(mpn_sub_n (r3, r3, r2, n3p1)); 278 279 mpn_sub_n (r4, r2, r4, n3p1); 280 ASSERT_NOCARRY(mpn_rshift(r4, r4, n3p1, 1)); 281 ASSERT_NOCARRY(mpn_sub_n (r2, r2, r4, n3p1)); 282 283 mpn_add_n (r5, r5, r1, n3p1); 284 ASSERT_NOCARRY(mpn_rshift(r5, r5, n3p1, 1)); 285 286 /* last interpolation steps... */ 287 ASSERT_NOCARRY(mpn_sub_n (r3, r3, r1, n3p1)); 288 ASSERT_NOCARRY(mpn_sub_n (r1, r1, r5, n3p1)); 289 /* ... could be mixed with recomposition 290 ||H-r5|M-r5|L-r5| ||H-r1|M-r1|L-r1| 291 */ 292 293 /***************************** recomposition *******************************/ 294 /* 295 pp[] prior to operations: 296 |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp 297 298 summation scheme for remaining operations: 299 |__12|n_11|n_10|n__9|n__8|n__7|n__6|n__5|n__4|n__3|n__2|n___|n___|pp 300 |M r0|L r0|___||H r2|M r2|L r2|___||H r4|M r4|L r4|____|H_r6|L r6|pp 301 ||H r1|M r1|L r1| ||H r3|M r3|L r3| ||H_r5|M_r5|L_r5| 302 */ 303 304 cy = mpn_add_n (pp + n, pp + n, r5, n); 305 cy = mpn_add_1 (pp + 2 * n, r5 + n, n, cy); 306 #if HAVE_NATIVE_mpn_add_nc 307 cy = r5[n3] + mpn_add_nc(pp + n3, pp + n3, r5 + 2 * n, n, cy); 308 #else 309 MPN_INCR_U (r5 + 2 * n, n + 1, cy); 310 cy = r5[n3] + mpn_add_n (pp + n3, pp + n3, r5 + 2 * n, n); 311 #endif 312 MPN_INCR_U (pp + n3 + n, 2 * n + 1, cy); 313 314 pp[2 * n3]+= mpn_add_n (pp + 5 * n, pp + 5 * n, r3, n); 315 cy = mpn_add_1 (pp + 2 * n3, r3 + n, n, pp[2 * n3]); 316 #if HAVE_NATIVE_mpn_add_nc 317 cy = r3[n3] + mpn_add_nc(pp + 7 * n, pp + 7 * n, r3 + 2 * n, n, cy); 318 #else 319 MPN_INCR_U (r3 + 2 * n, n + 1, cy); 320 cy = r3[n3] + mpn_add_n (pp + 7 * n, pp + 7 * n, r3 + 2 * n, n); 321 #endif 322 MPN_INCR_U (pp + 8 * n, 2 * n + 1, cy); 323 324 pp[10*n]+=mpn_add_n (pp + 9 * n, pp + 9 * n, r1, n); 325 if (half) { 326 cy = mpn_add_1 (pp + 10 * n, r1 + n, n, pp[10 * n]); 327 #if HAVE_NATIVE_mpn_add_nc 328 if (LIKELY (spt > n)) { 329 cy = r1[n3] + mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, n, cy); 330 MPN_INCR_U (pp + 4 * n3, spt - n, cy); 331 } else { 332 ASSERT_NOCARRY(mpn_add_nc(pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt, cy)); 333 } 334 #else 335 MPN_INCR_U (r1 + 2 * n, n + 1, cy); 336 if (LIKELY (spt > n)) { 337 cy = r1[n3] + mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, n); 338 MPN_INCR_U (pp + 4 * n3, spt - n, cy); 339 } else { 340 ASSERT_NOCARRY(mpn_add_n (pp + 11 * n, pp + 11 * n, r1 + 2 * n, spt)); 341 } 342 #endif 343 } else { 344 ASSERT_NOCARRY(mpn_add_1 (pp + 10 * n, r1 + n, spt, pp[10 * n])); 345 } 346 347 #undef r0 348 #undef r2 349 #undef r4 350 } 351