1 /* mpn_mul -- Multiply two natural numbers. 2 3 Contributed to the GNU project by Torbjorn Granlund. 4 5 Copyright 1991, 1993, 1994, 1996, 1997, 1999, 2000, 2001, 2002, 2003, 2005, 6 2006, 2007, 2009, 2010 Free Software Foundation, Inc. 7 8 This file is part of the GNU MP Library. 9 10 The GNU MP Library is free software; you can redistribute it and/or modify 11 it under the terms of the GNU Lesser General Public License as published by 12 the Free Software Foundation; either version 3 of the License, or (at your 13 option) any later version. 14 15 The GNU MP Library is distributed in the hope that it will be useful, but 16 WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY 17 or FITNESS FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public 18 License for more details. 19 20 You should have received a copy of the GNU Lesser General Public License 21 along with the GNU MP Library. If not, see http://www.gnu.org/licenses/. */ 22 23 #include "gmp.h" 24 #include "gmp-impl.h" 25 26 27 #ifndef MUL_BASECASE_MAX_UN 28 #define MUL_BASECASE_MAX_UN 500 29 #endif 30 31 #define TOOM33_OK(an,bn) (6 + 2 * an < 3 * bn) 32 #define TOOM44_OK(an,bn) (12 + 3 * an < 4 * bn) 33 34 /* Multiply the natural numbers u (pointed to by UP, with UN limbs) and v 35 (pointed to by VP, with VN limbs), and store the result at PRODP. The 36 result is UN + VN limbs. Return the most significant limb of the result. 37 38 NOTE: The space pointed to by PRODP is overwritten before finished with U 39 and V, so overlap is an error. 40 41 Argument constraints: 42 1. UN >= VN. 43 2. PRODP != UP and PRODP != VP, i.e. the destination must be distinct from 44 the multiplier and the multiplicand. */ 45 46 /* 47 * The cutoff lines in the toomX2 and toomX3 code are now exactly between the 48 ideal lines of the surrounding algorithms. Is that optimal? 49 50 * The toomX3 code now uses a structure similar to the one of toomX2, except 51 that it loops longer in the unbalanced case. The result is that the 52 remaining area might have un < vn. Should we fix the toomX2 code in a 53 similar way? 54 55 * The toomX3 code is used for the largest non-FFT unbalanced operands. It 56 therefore calls mpn_mul recursively for certain cases. 57 58 * Allocate static temp space using THRESHOLD variables (except for toom44 59 when !WANT_FFT). That way, we can typically have no TMP_ALLOC at all. 60 61 * We sort ToomX2 algorithms together, assuming the toom22, toom32, toom42 62 have the same vn threshold. This is not true, we should actually use 63 mul_basecase for slightly larger operands for toom32 than for toom22, and 64 even larger for toom42. 65 66 * That problem is even more prevalent for toomX3. We therefore use special 67 THRESHOLD variables there. 68 69 * Is our ITCH allocation correct? 70 */ 71 72 #define ITCH (16*vn + 100) 73 74 mp_limb_t 75 mpn_mul (mp_ptr prodp, 76 mp_srcptr up, mp_size_t un, 77 mp_srcptr vp, mp_size_t vn) 78 { 79 ASSERT (un >= vn); 80 ASSERT (vn >= 1); 81 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, up, un)); 82 ASSERT (! MPN_OVERLAP_P (prodp, un+vn, vp, vn)); 83 84 if (un == vn) 85 { 86 if (up == vp) 87 mpn_sqr (prodp, up, un); 88 else 89 mpn_mul_n (prodp, up, vp, un); 90 } 91 else if (vn < MUL_TOOM22_THRESHOLD) 92 { /* plain schoolbook multiplication */ 93 94 /* Unless un is very large, or else if have an applicable mpn_mul_N, 95 perform basecase multiply directly. */ 96 if (un <= MUL_BASECASE_MAX_UN 97 #if HAVE_NATIVE_mpn_mul_2 98 || vn <= 2 99 #else 100 || vn == 1 101 #endif 102 ) 103 mpn_mul_basecase (prodp, up, un, vp, vn); 104 else 105 { 106 /* We have un >> MUL_BASECASE_MAX_UN > vn. For better memory 107 locality, split up[] into MUL_BASECASE_MAX_UN pieces and multiply 108 these pieces with the vp[] operand. After each such partial 109 multiplication (but the last) we copy the most significant vn 110 limbs into a temporary buffer since that part would otherwise be 111 overwritten by the next multiplication. After the next 112 multiplication, we add it back. This illustrates the situation: 113 114 -->vn<-- 115 | |<------- un ------->| 116 _____________________| 117 X /| 118 /XX__________________/ | 119 _____________________ | 120 X / | 121 /XX__________________/ | 122 _____________________ | 123 / / | 124 /____________________/ | 125 ================================================================== 126 127 The parts marked with X are the parts whose sums are copied into 128 the temporary buffer. */ 129 130 mp_limb_t tp[MUL_TOOM22_THRESHOLD_LIMIT]; 131 mp_limb_t cy; 132 ASSERT (MUL_TOOM22_THRESHOLD <= MUL_TOOM22_THRESHOLD_LIMIT); 133 134 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); 135 prodp += MUL_BASECASE_MAX_UN; 136 MPN_COPY (tp, prodp, vn); /* preserve high triangle */ 137 up += MUL_BASECASE_MAX_UN; 138 un -= MUL_BASECASE_MAX_UN; 139 while (un > MUL_BASECASE_MAX_UN) 140 { 141 mpn_mul_basecase (prodp, up, MUL_BASECASE_MAX_UN, vp, vn); 142 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ 143 mpn_incr_u (prodp + vn, cy); 144 prodp += MUL_BASECASE_MAX_UN; 145 MPN_COPY (tp, prodp, vn); /* preserve high triangle */ 146 up += MUL_BASECASE_MAX_UN; 147 un -= MUL_BASECASE_MAX_UN; 148 } 149 if (un > vn) 150 { 151 mpn_mul_basecase (prodp, up, un, vp, vn); 152 } 153 else 154 { 155 ASSERT (un > 0); 156 mpn_mul_basecase (prodp, vp, vn, up, un); 157 } 158 cy = mpn_add_n (prodp, prodp, tp, vn); /* add back preserved triangle */ 159 mpn_incr_u (prodp + vn, cy); 160 } 161 } 162 else if (BELOW_THRESHOLD (vn, MUL_TOOM33_THRESHOLD)) 163 { 164 /* Use ToomX2 variants */ 165 mp_ptr scratch; 166 TMP_SDECL; TMP_SMARK; 167 168 scratch = TMP_SALLOC_LIMBS (ITCH); 169 170 /* FIXME: This condition (repeated in the loop below) leaves from a vn*vn 171 square to a (3vn-1)*vn rectangle. Leaving such a rectangle is hardly 172 wise; we would get better balance by slightly moving the bound. We 173 will sometimes end up with un < vn, like the the X3 arm below. */ 174 if (un >= 3 * vn) 175 { 176 mp_limb_t cy; 177 mp_ptr ws; 178 179 /* The maximum ws usage is for the mpn_mul result. */ 180 ws = TMP_SALLOC_LIMBS (4 * vn); 181 182 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch); 183 un -= 2 * vn; 184 up += 2 * vn; 185 prodp += 2 * vn; 186 187 while (un >= 3 * vn) 188 { 189 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch); 190 un -= 2 * vn; 191 up += 2 * vn; 192 cy = mpn_add_n (prodp, prodp, ws, vn); 193 MPN_COPY (prodp + vn, ws + vn, 2 * vn); 194 mpn_incr_u (prodp + vn, cy); 195 prodp += 2 * vn; 196 } 197 198 /* vn <= un < 3vn */ 199 200 if (4 * un < 5 * vn) 201 mpn_toom22_mul (ws, up, un, vp, vn, scratch); 202 else if (4 * un < 7 * vn) 203 mpn_toom32_mul (ws, up, un, vp, vn, scratch); 204 else 205 mpn_toom42_mul (ws, up, un, vp, vn, scratch); 206 207 cy = mpn_add_n (prodp, prodp, ws, vn); 208 MPN_COPY (prodp + vn, ws + vn, un); 209 mpn_incr_u (prodp + vn, cy); 210 } 211 else 212 { 213 if (4 * un < 5 * vn) 214 mpn_toom22_mul (prodp, up, un, vp, vn, scratch); 215 else if (4 * un < 7 * vn) 216 mpn_toom32_mul (prodp, up, un, vp, vn, scratch); 217 else 218 mpn_toom42_mul (prodp, up, un, vp, vn, scratch); 219 } 220 TMP_SFREE; 221 } 222 else if (BELOW_THRESHOLD ((un + vn) >> 1, MUL_FFT_THRESHOLD) || 223 BELOW_THRESHOLD (3 * vn, MUL_FFT_THRESHOLD)) 224 { 225 /* Handle the largest operands that are not in the FFT range. The 2nd 226 condition makes very unbalanced operands avoid the FFT code (except 227 perhaps as coefficient products of the Toom code. */ 228 229 if (BELOW_THRESHOLD (vn, MUL_TOOM44_THRESHOLD) || !TOOM44_OK (un, vn)) 230 { 231 /* Use ToomX3 variants */ 232 mp_ptr scratch; 233 TMP_SDECL; TMP_SMARK; 234 235 scratch = TMP_SALLOC_LIMBS (ITCH); 236 237 if (2 * un >= 5 * vn) 238 { 239 mp_limb_t cy; 240 mp_ptr ws; 241 242 /* The maximum ws usage is for the mpn_mul result. */ 243 ws = TMP_SALLOC_LIMBS (7 * vn >> 1); 244 245 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) 246 mpn_toom42_mul (prodp, up, 2 * vn, vp, vn, scratch); 247 else 248 mpn_toom63_mul (prodp, up, 2 * vn, vp, vn, scratch); 249 un -= 2 * vn; 250 up += 2 * vn; 251 prodp += 2 * vn; 252 253 while (2 * un >= 5 * vn) /* un >= 2.5vn */ 254 { 255 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) 256 mpn_toom42_mul (ws, up, 2 * vn, vp, vn, scratch); 257 else 258 mpn_toom63_mul (ws, up, 2 * vn, vp, vn, scratch); 259 un -= 2 * vn; 260 up += 2 * vn; 261 cy = mpn_add_n (prodp, prodp, ws, vn); 262 MPN_COPY (prodp + vn, ws + vn, 2 * vn); 263 mpn_incr_u (prodp + vn, cy); 264 prodp += 2 * vn; 265 } 266 267 /* vn / 2 <= un < 2.5vn */ 268 269 if (un < vn) 270 mpn_mul (ws, vp, vn, up, un); 271 else 272 mpn_mul (ws, up, un, vp, vn); 273 274 cy = mpn_add_n (prodp, prodp, ws, vn); 275 MPN_COPY (prodp + vn, ws + vn, un); 276 mpn_incr_u (prodp + vn, cy); 277 } 278 else 279 { 280 if (6 * un < 7 * vn) 281 mpn_toom33_mul (prodp, up, un, vp, vn, scratch); 282 else if (2 * un < 3 * vn) 283 { 284 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM43_THRESHOLD)) 285 mpn_toom32_mul (prodp, up, un, vp, vn, scratch); 286 else 287 mpn_toom43_mul (prodp, up, un, vp, vn, scratch); 288 } 289 else if (6 * un < 11 * vn) 290 { 291 if (4 * un < 7 * vn) 292 { 293 if (BELOW_THRESHOLD (vn, MUL_TOOM32_TO_TOOM53_THRESHOLD)) 294 mpn_toom32_mul (prodp, up, un, vp, vn, scratch); 295 else 296 mpn_toom53_mul (prodp, up, un, vp, vn, scratch); 297 } 298 else 299 { 300 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM53_THRESHOLD)) 301 mpn_toom42_mul (prodp, up, un, vp, vn, scratch); 302 else 303 mpn_toom53_mul (prodp, up, un, vp, vn, scratch); 304 } 305 } 306 else 307 { 308 if (BELOW_THRESHOLD (vn, MUL_TOOM42_TO_TOOM63_THRESHOLD)) 309 mpn_toom42_mul (prodp, up, un, vp, vn, scratch); 310 else 311 mpn_toom63_mul (prodp, up, un, vp, vn, scratch); 312 } 313 } 314 TMP_SFREE; 315 } 316 else 317 { 318 mp_ptr scratch; 319 TMP_DECL; TMP_MARK; 320 321 if (BELOW_THRESHOLD (vn, MUL_TOOM6H_THRESHOLD)) 322 { 323 scratch = TMP_ALLOC_LIMBS (mpn_toom44_mul_itch (un, vn)); 324 mpn_toom44_mul (prodp, up, un, vp, vn, scratch); 325 } 326 else if (BELOW_THRESHOLD (vn, MUL_TOOM8H_THRESHOLD)) 327 { 328 scratch = TMP_ALLOC_LIMBS (mpn_toom6h_mul_itch (un, vn)); 329 mpn_toom6h_mul (prodp, up, un, vp, vn, scratch); 330 } 331 else 332 { 333 scratch = TMP_ALLOC_LIMBS (mpn_toom8h_mul_itch (un, vn)); 334 mpn_toom8h_mul (prodp, up, un, vp, vn, scratch); 335 } 336 TMP_FREE; 337 } 338 } 339 else 340 { 341 if (un >= 8 * vn) 342 { 343 mp_limb_t cy; 344 mp_ptr ws; 345 TMP_DECL; TMP_MARK; 346 347 /* The maximum ws usage is for the mpn_mul result. */ 348 ws = TMP_BALLOC_LIMBS (9 * vn >> 1); 349 350 mpn_fft_mul (prodp, up, 3 * vn, vp, vn); 351 un -= 3 * vn; 352 up += 3 * vn; 353 prodp += 3 * vn; 354 355 while (2 * un >= 7 * vn) /* un >= 3.5vn */ 356 { 357 mpn_fft_mul (ws, up, 3 * vn, vp, vn); 358 un -= 3 * vn; 359 up += 3 * vn; 360 cy = mpn_add_n (prodp, prodp, ws, vn); 361 MPN_COPY (prodp + vn, ws + vn, 3 * vn); 362 mpn_incr_u (prodp + vn, cy); 363 prodp += 3 * vn; 364 } 365 366 /* vn / 2 <= un < 3.5vn */ 367 368 if (un < vn) 369 mpn_mul (ws, vp, vn, up, un); 370 else 371 mpn_mul (ws, up, un, vp, vn); 372 373 cy = mpn_add_n (prodp, prodp, ws, vn); 374 MPN_COPY (prodp + vn, ws + vn, un); 375 mpn_incr_u (prodp + vn, cy); 376 377 TMP_FREE; 378 } 379 else 380 mpn_fft_mul (prodp, up, un, vp, vn); 381 } 382 383 return prodp[un + vn - 1]; /* historic */ 384 } 385