/* mpn_toom62_mul -- Multiply {ap,an} and {bp,bn} where an is nominally 3 times
   as large as bn.  Or more accurately, (5/2)bn < an < 6bn.

   Contributed to the GNU project by Torbjorn Granlund and Marco Bodrato.

   The idea of applying toom to unbalanced multiplication is due to Marco
   Bodrato and Alberto Zanoni.

   THE FUNCTION IN THIS FILE IS INTERNAL WITH A MUTABLE INTERFACE.  IT IS ONLY
   SAFE TO REACH IT THROUGH DOCUMENTED INTERFACES.  IN FACT, IT IS ALMOST
   GUARANTEED THAT IT WILL CHANGE OR DISAPPEAR IN A FUTURE GNU MP RELEASE.

   Copyright 2006, 2007, 2008 Free Software Foundation, Inc.

   This file is part of the GNU MP Library.

   The GNU MP Library is free software; you can redistribute it and/or modify
   it under the terms of the GNU Lesser General Public License as published by
   the Free Software Foundation; either version 3 of the License, or (at your
   option) any later version.

   The GNU MP Library is distributed in the hope that it will be useful, but
   WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
   or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public
   License for more details.

   You should have received a copy of the GNU Lesser General Public License
   along with the GNU MP Library.  If not, see http://www.gnu.org/licenses/.  */


/*
  Things to work on:

  1. Trim allocation.  The allocations for as1, asm1, bs1, and bsm1 could be
     avoided by instead reusing the pp area and the scratch allocation.
*/

#include "gmp.h"
#include "gmp-impl.h"


/* Evaluate in: -1, -1/2, 0, +1/2, +1, +2, +inf

  <-s-><--n--><--n--><--n--><--n--><--n-->
   ___ ______ ______ ______ ______ ______
  |a5_|___a4_|___a3_|___a2_|___a1_|___a0_|
                             |_b1_|___b0_|
                             <-t--><--n-->

  v0  =    a0                         *   b0       # A(0)*B(0)
  v1  = (  a0+  a1+ a2+ a3+  a4+  a5)*( b0+ b1)    # A(1)*B(1)      ah <= 5   bh <= 1
  vm1 = (  a0-  a1+ a2- a3+  a4-  a5)*( b0- b1)    # A(-1)*B(-1)   |ah| <= 2  bh  = 0
  v2  = (  a0+ 2a1+4a2+8a3+16a4+32a5)*( b0+2b1)    # A(2)*B(2)      ah <= 62  bh <= 2
  vh  = (32a0+16a1+8a2+4a3+ 2a4+  a5)*(2b0+ b1)    # A(1/2)*B(1/2)  ah <= 62  bh <= 2
  vmh = (32a0-16a1+8a2-4a3+ 2a4-  a5)*(2b0- b1)    # A(-1/2)*B(-1/2)  -20<=ah<=41  0<=bh<=1
  vinf=    a5                         *      b1    # A(inf)*B(inf)
*/

void
mpn_toom62_mul (mp_ptr pp,
		mp_srcptr ap, mp_size_t an,
		mp_srcptr bp, mp_size_t bn,
		mp_ptr scratch)
{
  mp_size_t n, s, t;
  int vm1_neg, vmh_neg, bsm_neg;
  mp_limb_t cy;
  mp_ptr a0_a2, a1_a3;
  mp_ptr as1, asm1, as2, ash, asmh;
  mp_ptr bs1, bsm1, bs2, bsh, bsmh;
  enum toom4_flags flags;
  TMP_DECL;

#define a0  ap
#define a1  (ap + n)
#define a2  (ap + 2*n)
#define a3  (ap + 3*n)
#define a4  (ap + 4*n)
#define a5  (ap + 5*n)
#define b0  bp
#define b1  (bp + n)

  n = 1 + (an >= 3 * bn ? (an - 1) / (unsigned long) 6 : (bn - 1) >> 1);

  s = an - 5 * n;
  t = bn - n;

  ASSERT (0 < s && s <= n);
  ASSERT (0 < t && t <= n);
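
  /* Worked examples of the splitting above, added for exposition (not part of
     the original source): an = 24, bn = 5 gives n = 4, s = 4, t = 1, while
     an = 16, bn = 6 (the an < 3*bn branch) gives n = 3, s = 1, t = 3.  In
     both cases 0 < s <= n and 0 < t <= n hold, as asserted.  */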

  TMP_MARK;

  as1  = TMP_SALLOC_LIMBS (n + 1);
  asm1 = TMP_SALLOC_LIMBS (n + 1);
  as2  = TMP_SALLOC_LIMBS (n + 1);
  ash  = TMP_SALLOC_LIMBS (n + 1);
  asmh = TMP_SALLOC_LIMBS (n + 1);

  bs1  = TMP_SALLOC_LIMBS (n + 1);
  bsm1 = TMP_SALLOC_LIMBS (n);
  bs2  = TMP_SALLOC_LIMBS (n + 1);
  bsh  = TMP_SALLOC_LIMBS (n + 1);
  bsmh = TMP_SALLOC_LIMBS (n + 1);

  a0_a2 = pp;
  a1_a3 = pp + n + 1;

  /* Compute as1 and asm1.  */
  a0_a2[n]  = mpn_add_n (a0_a2, a0, a2, n);
  a0_a2[n] += mpn_add_n (a0_a2, a0_a2, a4, n);
  a1_a3[n]  = mpn_add_n (a1_a3, a1, a3, n);
  a1_a3[n] += mpn_add (a1_a3, a1_a3, n, a5, s);
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_addsub_n (as1, asm1, a1_a3, a0_a2, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_addsub_n (as1, asm1, a0_a2, a1_a3, n + 1);
      vm1_neg = 0;
    }
#else
  mpn_add_n (as1, a0_a2, a1_a3, n + 1);
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_sub_n (asm1, a1_a3, a0_a2, n + 1);
      vm1_neg = 1;
    }
  else
    {
      mpn_sub_n (asm1, a0_a2, a1_a3, n + 1);
      vm1_neg = 0;
    }
#endif

  /* Compute as2.  */
#if HAVE_NATIVE_mpn_addlsh1_n
  cy  = mpn_addlsh1_n (as2, a4, a5, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy);
  cy = 2 * cy + mpn_addlsh1_n (as2, a3, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_addlsh1_n (as2, a0, as2, n);
#else
  cy  = mpn_lshift (as2, a5, s, 1);
  cy += mpn_add_n (as2, a4, as2, s);
  if (s != n)
    cy = mpn_add_1 (as2 + s, a4 + s, n - s, cy);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a3, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a2, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a1, as2, n);
  cy = 2 * cy + mpn_lshift (as2, as2, n, 1);
  cy += mpn_add_n (as2, a0, as2, n);
#endif
  as2[n] = cy;

  /* Compute ash and asmh.  */
#if HAVE_NATIVE_mpn_addlsh_n
  cy = mpn_addlsh_n (a0_a2, a2, a0, n, 2);              /*  4a0 +  a2       */
  cy = 4 * cy + mpn_addlsh_n (a0_a2, a4, a0_a2, n, 2);  /* 16a0 + 4a2 +  a4 */
  cy = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1);        /* 32a0 + 8a2 + 2a4 */
  a0_a2[n] = cy;
  cy = mpn_addlsh_n (a1_a3, a3, a1, n, 2);              /*  4a1 +  a3       */
  cy = 4 * cy + mpn_addlsh_n (a1_a3, a5, a1_a3, n, 2);  /* 16a1 + 4a3 +  a5 */
  a1_a3[n] = cy;
#else
  cy  = mpn_lshift (a0_a2, a0, n, 2);                   /*  4a0             */
  cy += mpn_add_n (a0_a2, a2, a0_a2, n);                /*  4a0 +  a2       */
  cy  = 4 * cy + mpn_lshift (a0_a2, a0_a2, n, 2);       /* 16a0 + 4a2       */
  cy += mpn_add_n (a0_a2, a4, a0_a2, n);                /* 16a0 + 4a2 +  a4 */
  cy  = 2 * cy + mpn_lshift (a0_a2, a0_a2, n, 1);       /* 32a0 + 8a2 + 2a4 */
  a0_a2[n] = cy;
  cy  = mpn_lshift (a1_a3, a1, n, 2);                   /*  4a1             */
  cy += mpn_add_n (a1_a3, a3, a1_a3, n);                /*  4a1 +  a3       */
  cy  = 4 * cy + mpn_lshift (a1_a3, a1_a3, n, 2);       /* 16a1 + 4a3       */
  cy += mpn_add (a1_a3, a1_a3, n, a5, s);               /* 16a1 + 4a3 +  a5 */
  a1_a3[n] = cy;
#endif
#if HAVE_NATIVE_mpn_addsub_n
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_addsub_n (ash, asmh, a1_a3, a0_a2, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_addsub_n (ash, asmh, a0_a2, a1_a3, n + 1);
      vmh_neg = 0;
    }
#else
  mpn_add_n (ash, a0_a2, a1_a3, n + 1);
  if (mpn_cmp (a0_a2, a1_a3, n + 1) < 0)
    {
      mpn_sub_n (asmh, a1_a3, a0_a2, n + 1);
      vmh_neg = 1;
    }
  else
    {
      mpn_sub_n (asmh, a0_a2, a1_a3, n + 1);
      vmh_neg = 0;
    }
#endif
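
  /* Note added for exposition (not in the original source): at this point ash
     holds 32*A(1/2) = 32a0+16a1+8a2+4a3+2a4+a5 and asmh holds the absolute
     value of 32*A(-1/2), with its sign recorded in vmh_neg; compare the vh
     and vmh lines in the evaluation comment above.  */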

  /* Compute bs1 and bsm1.  */
  if (t == n)
    {
#if HAVE_NATIVE_mpn_addsub_n
      if (mpn_cmp (b0, b1, n) < 0)
	{
	  cy = mpn_addsub_n (bs1, bsm1, b1, b0, n);
	  bsm_neg = 1;
	}
      else
	{
	  cy = mpn_addsub_n (bs1, bsm1, b0, b1, n);
	  bsm_neg = 0;
	}
      bs1[n] = cy >> 1;
#else
      bs1[n] = mpn_add_n (bs1, b0, b1, n);
      if (mpn_cmp (b0, b1, n) < 0)
	{
	  mpn_sub_n (bsm1, b1, b0, n);
	  bsm_neg = 1;
	}
      else
	{
	  mpn_sub_n (bsm1, b0, b1, n);
	  bsm_neg = 0;
	}
#endif
    }
  else
    {
      bs1[n] = mpn_add (bs1, b0, n, b1, t);
      if (mpn_zero_p (b0 + t, n - t) && mpn_cmp (b0, b1, t) < 0)
	{
	  mpn_sub_n (bsm1, b1, b0, t);
	  MPN_ZERO (bsm1 + t, n - t);
	  bsm_neg = 1;
	}
      else
	{
	  mpn_sub (bsm1, b0, n, b1, t);
	  bsm_neg = 0;
	}
    }

  vm1_neg ^= bsm_neg;

  /* Compute bs2, recycling bs1.  bs2 = bs1 + b1  */
  mpn_add (bs2, bs1, n + 1, b1, t);

  /* Compute bsh and bsmh, recycling bs1 and bsm1.  bsh = bs1 + b0; bsmh = bsm1 + b0  */
  if (bsm_neg == 1)
    {
      bsmh[n] = 0;
      if (mpn_cmp (bsm1, b0, n) < 0)
	{
	  bsm_neg = 0;
	  mpn_sub_n (bsmh, b0, bsm1, n);
	}
      else
	mpn_sub_n (bsmh, bsm1, b0, n);
    }
  else
    bsmh[n] = mpn_add_n (bsmh, bsm1, b0, n);
  mpn_add (bsh, bs1, n + 1, b0, n);
  vmh_neg ^= bsm_neg;


  ASSERT (as1[n] <= 5);
  ASSERT (bs1[n] <= 1);
  ASSERT (asm1[n] <= 2);
  /*ASSERT (bsm1[n] == 0);*/
  ASSERT (as2[n] <= 62);
  ASSERT (bs2[n] <= 2);
  ASSERT (ash[n] <= 62);
  ASSERT (bsh[n] <= 2);
  ASSERT (asmh[n] <= 41);
  ASSERT (bsmh[n] <= 1);

#define v0    pp				/* 2n   */
#define v1    (scratch + 6 * n + 6)		/* 2n+1 */
#define vinf  (pp + 6 * n)			/* s+t  */
#define vm1   scratch				/* 2n+1 */
#define v2    (scratch + 2 * n + 2)		/* 2n+1 */
#define vh    (pp + 2 * n)			/* 2n+1 */
#define vmh   (scratch + 4 * n + 4)		/* 2n+1 */

  /* vm1, 2n+1 limbs */
  mpn_mul_n (vm1, asm1, bsm1, n);
  cy = 0;
  if (asm1[n] == 1)
    {
      cy = mpn_add_n (vm1 + n, vm1 + n, bsm1, n);
    }
  else if (asm1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = mpn_addlsh1_n (vm1 + n, vm1 + n, bsm1, n);
#else
      cy = mpn_addmul_1 (vm1 + n, bsm1, n, CNST_LIMB(2));
#endif
    }
  vm1[2 * n] = cy;

  mpn_mul_n (v2, as2, bs2, n + 1);	/* v2, 2n+1 limbs */

  /* vinf, s+t limbs */
  if (s > t)
    mpn_mul (vinf, a5, s, b1, t);
  else
    mpn_mul (vinf, b1, t, a5, s);

  /* v1, 2n+1 limbs */
  mpn_mul_n (v1, as1, bs1, n);
  if (as1[n] == 1)
    {
      cy = bs1[n] + mpn_add_n (v1 + n, v1 + n, bs1, n);
    }
  else if (as1[n] == 2)
    {
#if HAVE_NATIVE_mpn_addlsh1_n
      cy = 2 * bs1[n] + mpn_addlsh1_n (v1 + n, v1 + n, bs1, n);
#else
      cy = 2 * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, CNST_LIMB(2));
#endif
    }
  else if (as1[n] != 0)
    {
      cy = as1[n] * bs1[n] + mpn_addmul_1 (v1 + n, bs1, n, as1[n]);
    }
  else
    cy = 0;
  if (bs1[n] != 0)
    cy += mpn_add_n (v1 + n, v1 + n, as1, n);
  v1[2 * n] = cy;

  mpn_mul_n (vh, ash, bsh, n + 1);

  mpn_mul_n (vmh, asmh, bsmh, n + 1);

  mpn_mul_n (v0, ap, bp, n);		/* v0, 2n limbs */

  flags = vm1_neg ? toom4_w3_neg : 0;
  flags |= vmh_neg ? toom4_w1_neg : 0;

  mpn_toom_interpolate_7pts (pp, n, flags, vmh, vm1, v1, v2, s + t, scratch + 8 * n + 8);

  TMP_FREE;
}
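
/* The sketch below is added for exposition only and is not part of the
   original file.  It illustrates one way a caller might invoke
   mpn_toom62_mul, assuming pp has room for the full an + bn limb product and
   that 12*n + 12 scratch limbs are enough.  That figure is a guessed bound
   based on the offsets above (the four 2n+2 limb product areas end at
   scratch + 8*n + 8, where the workspace handed to mpn_toom_interpolate_7pts
   begins); it is not a documented requirement.  The helper name is
   hypothetical.  */
#if 0
static void
toom62_mul_example (mp_ptr pp,
		    mp_srcptr ap, mp_size_t an,
		    mp_srcptr bp, mp_size_t bn)
{
  mp_ptr scratch;
  mp_size_t n;
  TMP_DECL;

  /* Same splitting as in mpn_toom62_mul above.  */
  n = 1 + (an >= 3 * bn ? (an - 1) / (unsigned long) 6 : (bn - 1) >> 1);

  TMP_MARK;
  scratch = TMP_ALLOC_LIMBS (12 * n + 12);	/* assumed bound, see above */
  mpn_toom62_mul (pp, ap, an, bp, bn, scratch);
  TMP_FREE;
}
#endif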